Index: llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -47,12 +47,17 @@ DominatorTree *DT; const GCNSubtarget *ST; bool IsPixelShader; + bool IsComputeKernel; Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V, Value *const Identity) const; Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const; + std::pair<Value *, Value *> + buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op, + Value *const Identity, Value *const Ballot, Value *V, + Instruction &I) const; void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx, bool ValDivergent) const; @@ -93,6 +98,7 @@ const TargetMachine &TM = TPC.getTM<TargetMachine>(); ST = &TM.getSubtarget<GCNSubtarget>(F); IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS; + IsComputeKernel = F.getCallingConv() == CallingConv::AMDGPU_KERNEL; visit(F); @@ -430,6 +436,73 @@ return V; } +// Use the builder to create an exclusive scan and compute the final reduced +// value using an iterative approach. This provides an alternative +// implementation to DPP which uses WWM for scan computations. This API iterates +// over lanes to read, compute and update (conditionally) the value using +// readlane and writelane intrinsics. Returns {exclusive scan, reduced value}. 
+std::pair<Value *, Value *> AMDGPUAtomicOptimizer::buildScanIteratively( + IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, + Value *const Ballot, Value *V, Instruction &I) const { + Type *const Ty = I.getType(); + const bool NeedResult = !I.use_empty(); + Module *M = B.GetInsertBlock()->getModule(); + Function *WriteLaneDecl = + Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {}); + Value *ReducedValue = Identity; + Value *PartialSumTillPreviousLane = Identity; + BasicBlock *CurrentBasicBlock = nullptr; + PHINode *DestWrite = nullptr; + Value *Scan = V; + Value *DestWriteOfPreviousLane = V; + Instruction *TerminatorInConditionalWritelaneBlock = nullptr; + unsigned WaveFrontSize = ST->isWave32() ? 32 : 64; + Type *const BallotTy = Ballot->getType(); + const unsigned TyBitWidth = DL->getTypeSizeInBits(BallotTy); + + for (unsigned LaneIdx = 0; LaneIdx < WaveFrontSize; LaneIdx++) { + // Iterate over all the lanes of a wavefront to compute the partial sum. If + // the lane is not active select the Identity value in computation, + // otherwise consider value extracted using readlane. + Value *Mask = + B.CreateShl(B.getIntN(TyBitWidth, 1), B.getIntN(TyBitWidth, LaneIdx)); + Value *BallotAndMask = B.CreateAnd(Ballot, Mask); + Value *IsBitSet = B.CreateICmpEQ(BallotAndMask, Mask); + Value *Select = + B.CreateSelect(IsBitSet, + B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {V, B.getInt32(LaneIdx)}), + Identity); + ReducedValue = buildNonAtomicBinOp(B, Op, ReducedValue, Select); + + // Perform the writelane conditionally on only active lanes if the + // intermediate scan results are required. 
+ if (NeedResult) { + CurrentBasicBlock = I.getParent(); + // Split the current basic block into IfThen to perform writelane + // conditionally on active lanes only + TerminatorInConditionalWritelaneBlock = + SplitBlockAndInsertIfThen(IsBitSet, &I, false, nullptr, DT, nullptr); + + // Write exclusive scan (partial sum till the previous lane) result into the + // current lane, layered on the PHI-merged result of the earlier lanes + B.SetInsertPoint(TerminatorInConditionalWritelaneBlock); + Scan = B.CreateCall(WriteLaneDecl, {PartialSumTillPreviousLane, + B.getInt32(LaneIdx), DestWriteOfPreviousLane}); + + B.SetInsertPoint(&I); + DestWrite = B.CreatePHI(Ty, 2, "DestWrite"); + DestWrite->addIncoming(DestWriteOfPreviousLane, CurrentBasicBlock); + DestWrite->addIncoming( + Scan, TerminatorInConditionalWritelaneBlock->getParent()); + // Values used for the next iteration + DestWriteOfPreviousLane = DestWrite; + PartialSumTillPreviousLane = ReducedValue; + } + } + return std::make_pair(DestWriteOfPreviousLane, ReducedValue); +} + static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op, unsigned BitWidth) { switch (Op) { @@ -531,33 +604,39 @@ // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { - // First we need to set all inactive invocations to the identity value, so - // that they can correctly contribute to the final result. - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); - const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; - if (!NeedResult && ST->hasPermLaneX16()) { - // On GFX10 the permlanex16 instruction helps us build a reduction without - // too many readlanes and writelanes, which are generally bad for - // performance. - NewV = buildReduction(B, ScanOp, NewV, Identity); + if (!IsComputeKernel) { + // First we need to set all inactive invocations to the identity value, so + // that they can correctly contribute to the final result. 
+ NewV = + B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity}); + const AtomicRMWInst::BinOp ScanOp = + Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; + if (!NeedResult && ST->hasPermLaneX16()) { + // On GFX10 the permlanex16 instruction helps us build a reduction + // without too many readlanes and writelanes, which are generally bad + // for performance. + NewV = buildReduction(B, ScanOp, NewV, Identity); + } else { + NewV = buildScan(B, ScanOp, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); + // Read the value from the last lane, which has accumulated the values + // of each active lane in the wavefront. This will be our new value + // which we will provide to the atomic operation. + Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); + assert(TyBitWidth == 32); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, + {NewV, LastLaneIdx}); + } + // Finally mark the readlanes in the WWM section. + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { - NewV = buildScan(B, ScanOp, NewV, Identity); - if (NeedResult) - ExclScan = buildShiftRight(B, NewV, Identity); - - // Read the value from the last lane, which has accumulated the values of - // each active lane in the wavefront. This will be our new value which we - // will provide to the atomic operation. - Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - assert(TyBitWidth == 32); - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); + // Alternative implementation for scan + std::tie(ExclScan, NewV) = + buildScanIteratively(B, ScanOp, Identity, Ballot, V, I); } - - // Finally mark the readlanes in the WWM section. 
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); } else { switch (Op) { default: @@ -594,11 +673,19 @@ } } - // We only want a single lane to enter our new control flow, and we do this - // by checking if there are any active lanes below us. Only one lane will - // have 0 active lanes below us, so that will be the only one to progress. - Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); - + Value *Cond = nullptr; + if (ValDivergent && IsComputeKernel) { + // Only the first active lane will enter the new control flow to update the + // value. + CallInst *const FirstActiveLane = + B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, Mbcnt); + Cond = B.CreateICmpEQ(Mbcnt, FirstActiveLane); + } else { + // We only want a single lane to enter our new control flow, and we do this + // by checking if there are any active lanes below us. Only one lane will + // have 0 active lanes below us, so that will be the only one to progress. + Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0)); + } // Store I's original basic block before we split the block. BasicBlock *const EntryBB = I.getParent(); @@ -660,8 +747,12 @@ // from the first lane, to get our lane's index into the atomic result. 
Value *LaneOffset = nullptr; if (ValDivergent) { - LaneOffset = - B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + if (!IsComputeKernel) { + LaneOffset = + B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan); + } else { + LaneOffset = ExclScan; + } } else { switch (Op) { default: Index: llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -270,11 +270,10 @@ cl::init(true), cl::Hidden); // Enable atomic optimization -static cl::opt<bool> EnableAtomicOptimizations( - "amdgpu-atomic-optimizations", - cl::desc("Enable atomic optimizations"), - cl::init(false), - cl::Hidden); +static cl::opt<bool> + EnableAtomicOptimizations("amdgpu-atomic-optimizations", + cl::desc("Enable atomic optimizations"), + cl::init(true), cl::Hidden); // Enable Mode register optimization static cl::opt<bool> EnableSIModeRegisterPass( Index: llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -902,14 +902,36 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_cbranch_execz .LBB32_4 +; GFX6-NEXT: ; %bb.1: +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_cbranch_execz .LBB32_3 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 
s2, -1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_movk_i32 s4, 0x3ffc +; GFX6-NEXT: s_movk_i32 s8, 0x3ffc ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX6-NEXT: buffer_atomic_add v1, off, s[0:3], s8 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: .LBB32_3: +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: .LBB32_4: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; @@ -917,14 +939,36 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_execz .LBB32_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB32_3 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_lshl_b32 s2, s2, 1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_movk_i32 s4, 0x3ffc +; GFX7-NEXT: s_movk_i32 s8, 0x3ffc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, off, s[0:3], s4 glc +; GFX7-NEXT: buffer_atomic_add v1, off, s[0:3], s8 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: .LBB32_3: +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: .LBB32_4: ; %Flow +; 
GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4095 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst @@ -935,37 +979,81 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_offset4294967296(ptr addrspace(1) inreg %ptr) { ; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_mov_b32_e32 v0, 2 +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: ; implicit-def: $vgpr0 +; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_cbranch_execz .LBB33_4 +; GFX6-NEXT: ; %bb.1: +; GFX6-NEXT: s_mov_b64 s[2:3], exec +; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX6-NEXT: ; implicit-def: $vgpr1 +; GFX6-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX6-NEXT: s_cbranch_execz .LBB33_3 +; GFX6-NEXT: ; %bb.2: +; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX6-NEXT: s_lshl_b32 s2, s2, 1 +; GFX6-NEXT: s_mov_b32 s9, 4 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, s4 -; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: s_mov_b32 s2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX6-NEXT: buffer_atomic_add v1, v[2:3], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: .LBB33_3: +; GFX6-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX6-NEXT: v_readfirstlane_b32 s0, v1 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: .LBB33_4: ; %Flow +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX6-NEXT: s_waitcnt expcnt(0) ; 
GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_offset4294967296: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, 0 -; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, 2 +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: ; implicit-def: $vgpr0 +; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_cbranch_execz .LBB33_4 +; GFX7-NEXT: ; %bb.1: +; GFX7-NEXT: s_mov_b64 s[2:3], exec +; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_mov_b32 s8, 0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: ; implicit-def: $vgpr1 +; GFX7-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX7-NEXT: s_cbranch_execz .LBB33_3 +; GFX7-NEXT: ; %bb.2: +; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7-NEXT: s_lshl_b32 s2, s2, 1 +; GFX7-NEXT: s_mov_b32 s9, 4 +; GFX7-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b32 s2, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: s_mov_b32 s2, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc +; GFX7-NEXT: buffer_atomic_add v1, v[2:3], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_wbinvl1 +; GFX7-NEXT: .LBB33_3: +; GFX7-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX7-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7-NEXT: .LBB33_4: ; %Flow +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4294967296 %result = atomicrmw add ptr addrspace(1) %gep, i32 2 seq_cst Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -449,313 +449,5026 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB2_3 +; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB2_6 +; GFX8-NEXT: .LBB2_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB2_9 +; GFX8-NEXT: .LBB2_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB2_12 +; GFX8-NEXT: .LBB2_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB2_15 +; GFX8-NEXT: .LBB2_14: +; GFX8-NEXT: ; implicit-def: 
$vgpr1 +; GFX8-NEXT: .LBB2_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB2_18 +; GFX8-NEXT: .LBB2_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB2_21 +; GFX8-NEXT: .LBB2_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB2_24 +; GFX8-NEXT: .LBB2_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_26 +; 
GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB2_27 +; GFX8-NEXT: .LBB2_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB2_30 +; GFX8-NEXT: .LBB2_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB2_33 +; GFX8-NEXT: .LBB2_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB2_36 +; GFX8-NEXT: .LBB2_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB2_39 +; GFX8-NEXT: .LBB2_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB2_42 +; GFX8-NEXT: .LBB2_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB2_45 +; GFX8-NEXT: .LBB2_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB2_48 +; GFX8-NEXT: .LBB2_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB2_51 +; GFX8-NEXT: .LBB2_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB2_54 +; GFX8-NEXT: .LBB2_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB2_57 +; GFX8-NEXT: .LBB2_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 
v1, s6, 19 +; GFX8-NEXT: s_branch .LBB2_60 +; GFX8-NEXT: .LBB2_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB2_63 +; GFX8-NEXT: .LBB2_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB2_66 +; GFX8-NEXT: .LBB2_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB2_69 +; GFX8-NEXT: .LBB2_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB2_72 +; GFX8-NEXT: .LBB2_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB2_75 +; GFX8-NEXT: .LBB2_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB2_78 +; GFX8-NEXT: .LBB2_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB2_81 +; GFX8-NEXT: .LBB2_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB2_84 +; GFX8-NEXT: .LBB2_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB2_87 +; GFX8-NEXT: .LBB2_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB2_90 +; GFX8-NEXT: .LBB2_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: 
v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB2_93 +; GFX8-NEXT: .LBB2_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB2_96 +; GFX8-NEXT: .LBB2_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB2_99 +; GFX8-NEXT: .LBB2_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB2_102 +; GFX8-NEXT: .LBB2_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB2_105 +; GFX8-NEXT: .LBB2_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB2_108 +; GFX8-NEXT: .LBB2_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB2_111 +; GFX8-NEXT: .LBB2_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB2_114 +; GFX8-NEXT: .LBB2_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB2_117 +; GFX8-NEXT: .LBB2_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB2_120 +; GFX8-NEXT: .LBB2_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB2_123 +; GFX8-NEXT: .LBB2_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB2_126 
+; GFX8-NEXT: .LBB2_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB2_129 +; GFX8-NEXT: .LBB2_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB2_132 +; GFX8-NEXT: .LBB2_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB2_135 +; GFX8-NEXT: .LBB2_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB2_138 +; GFX8-NEXT: .LBB2_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB2_141 +; GFX8-NEXT: .LBB2_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB2_144 +; GFX8-NEXT: .LBB2_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB2_147 +; GFX8-NEXT: .LBB2_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB2_150 +; GFX8-NEXT: .LBB2_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB2_153 +; GFX8-NEXT: .LBB2_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB2_156 +; GFX8-NEXT: .LBB2_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: 
s_branch .LBB2_159 +; GFX8-NEXT: .LBB2_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB2_162 +; GFX8-NEXT: .LBB2_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB2_165 +; GFX8-NEXT: .LBB2_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB2_168 +; GFX8-NEXT: .LBB2_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB2_171 +; GFX8-NEXT: .LBB2_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB2_174 +; GFX8-NEXT: .LBB2_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB2_177 +; GFX8-NEXT: .LBB2_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB2_180 +; GFX8-NEXT: .LBB2_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB2_183 +; GFX8-NEXT: .LBB2_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB2_186 +; GFX8-NEXT: .LBB2_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB2_189 +; GFX8-NEXT: .LBB2_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB2_192 +; GFX8-NEXT: .LBB2_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB2_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; 
GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB2_3 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB2_6 +; GFX9-NEXT: .LBB2_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX9-NEXT: ; 
%bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB2_9 +; GFX9-NEXT: .LBB2_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB2_12 +; GFX9-NEXT: .LBB2_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB2_15 +; GFX9-NEXT: .LBB2_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB2_18 +; GFX9-NEXT: .LBB2_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 
0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB2_21 +; GFX9-NEXT: .LBB2_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB2_24 +; GFX9-NEXT: .LBB2_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB2_27 +; GFX9-NEXT: .LBB2_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB2_30 +; GFX9-NEXT: .LBB2_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB2_33 +; GFX9-NEXT: .LBB2_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB2_36 +; GFX9-NEXT: .LBB2_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB2_39 +; GFX9-NEXT: .LBB2_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB2_42 +; 
GFX9-NEXT: .LBB2_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB2_45 +; GFX9-NEXT: .LBB2_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB2_48 +; GFX9-NEXT: .LBB2_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB2_51 +; GFX9-NEXT: .LBB2_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB2_54 +; GFX9-NEXT: .LBB2_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB2_57 +; GFX9-NEXT: .LBB2_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB2_60 +; GFX9-NEXT: .LBB2_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB2_63 +; GFX9-NEXT: .LBB2_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, 
s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB2_66 +; GFX9-NEXT: .LBB2_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB2_69 +; GFX9-NEXT: .LBB2_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB2_72 +; GFX9-NEXT: .LBB2_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB2_75 +; GFX9-NEXT: .LBB2_74: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB2_78 +; GFX9-NEXT: .LBB2_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB2_81 +; GFX9-NEXT: .LBB2_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB2_84 +; GFX9-NEXT: .LBB2_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB2_87 +; GFX9-NEXT: .LBB2_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB2_90 +; GFX9-NEXT: .LBB2_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB2_93 +; GFX9-NEXT: .LBB2_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB2_96 +; GFX9-NEXT: .LBB2_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 
s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB2_99 +; GFX9-NEXT: .LBB2_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB2_102 +; GFX9-NEXT: .LBB2_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB2_105 +; GFX9-NEXT: .LBB2_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB2_108 +; GFX9-NEXT: .LBB2_107: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB2_111 +; GFX9-NEXT: .LBB2_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB2_114 +; GFX9-NEXT: .LBB2_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB2_117 +; GFX9-NEXT: .LBB2_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB2_120 +; GFX9-NEXT: .LBB2_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB2_123 +; GFX9-NEXT: .LBB2_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB2_126 +; GFX9-NEXT: .LBB2_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB2_129 +; GFX9-NEXT: .LBB2_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 
s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB2_132 +; GFX9-NEXT: .LBB2_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB2_135 +; GFX9-NEXT: .LBB2_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB2_138 +; GFX9-NEXT: .LBB2_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB2_141 +; GFX9-NEXT: .LBB2_140: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB2_144 +; GFX9-NEXT: .LBB2_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB2_147 +; GFX9-NEXT: .LBB2_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB2_150 +; GFX9-NEXT: .LBB2_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 
s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB2_153 +; GFX9-NEXT: .LBB2_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB2_156 +; GFX9-NEXT: .LBB2_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB2_159 +; GFX9-NEXT: .LBB2_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB2_162 +; GFX9-NEXT: .LBB2_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; 
GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB2_165 +; GFX9-NEXT: .LBB2_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB2_168 +; GFX9-NEXT: .LBB2_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB2_171 +; GFX9-NEXT: .LBB2_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB2_174 +; GFX9-NEXT: 
.LBB2_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB2_177 +; GFX9-NEXT: .LBB2_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB2_180 +; GFX9-NEXT: .LBB2_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB2_183 +; GFX9-NEXT: .LBB2_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB2_186 +; GFX9-NEXT: .LBB2_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB2_189 +; GFX9-NEXT: .LBB2_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB2_192 +; GFX9-NEXT: .LBB2_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB2_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB2_3 +; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB2_6 +; GFX10W64-NEXT: .LBB2_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; 
GFX10W64-NEXT: s_branch .LBB2_9 +; GFX10W64-NEXT: .LBB2_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB2_12 +; GFX10W64-NEXT: .LBB2_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB2_15 +; GFX10W64-NEXT: .LBB2_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX10W64-NEXT: ; %bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB2_18 +; GFX10W64-NEXT: .LBB2_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB2_21 +; GFX10W64-NEXT: .LBB2_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB2_24 +; GFX10W64-NEXT: .LBB2_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB2_27 +; GFX10W64-NEXT: .LBB2_26: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: 
v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX10W64-NEXT: ; %bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX10W64-NEXT: s_branch .LBB2_30 +; GFX10W64-NEXT: .LBB2_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB2_33 +; GFX10W64-NEXT: .LBB2_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB2_36 +; GFX10W64-NEXT: .LBB2_35: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB2_39 +; GFX10W64-NEXT: .LBB2_38: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_39: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB2_42 +; GFX10W64-NEXT: .LBB2_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_42: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB2_45 +; GFX10W64-NEXT: .LBB2_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX10W64-NEXT: ; %bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: 
s_branch .LBB2_48 +; GFX10W64-NEXT: .LBB2_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB2_51 +; GFX10W64-NEXT: .LBB2_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB2_54 +; GFX10W64-NEXT: .LBB2_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB2_57 +; GFX10W64-NEXT: .LBB2_56: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_57: +; 
GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB2_60 +; GFX10W64-NEXT: .LBB2_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB2_63 +; GFX10W64-NEXT: .LBB2_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB2_66 +; GFX10W64-NEXT: .LBB2_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: 
s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB2_69 +; GFX10W64-NEXT: .LBB2_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB2_72 +; GFX10W64-NEXT: .LBB2_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX10W64-NEXT: s_branch .LBB2_75 +; GFX10W64-NEXT: .LBB2_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_75: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; 
GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX10W64-NEXT: s_branch .LBB2_78 +; GFX10W64-NEXT: .LBB2_77: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB2_81 +; GFX10W64-NEXT: .LBB2_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB2_84 +; GFX10W64-NEXT: .LBB2_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_84: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 
.LBB2_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB2_87 +; GFX10W64-NEXT: .LBB2_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB2_90 +; GFX10W64-NEXT: .LBB2_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB2_93 +; GFX10W64-NEXT: .LBB2_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB2_96 +; GFX10W64-NEXT: .LBB2_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: 
.LBB2_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB2_99 +; GFX10W64-NEXT: .LBB2_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB2_102 +; GFX10W64-NEXT: .LBB2_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX10W64-NEXT: s_branch .LBB2_105 +; GFX10W64-NEXT: .LBB2_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, 
exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB2_108 +; GFX10W64-NEXT: .LBB2_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB2_111 +; GFX10W64-NEXT: .LBB2_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX10W64-NEXT: s_branch .LBB2_114 +; GFX10W64-NEXT: .LBB2_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_114: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 
exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB2_117 +; GFX10W64-NEXT: .LBB2_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB2_120 +; GFX10W64-NEXT: .LBB2_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB2_123 +; GFX10W64-NEXT: .LBB2_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_125 +; 
GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB2_126 +; GFX10W64-NEXT: .LBB2_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB2_129 +; GFX10W64-NEXT: .LBB2_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB2_132 +; GFX10W64-NEXT: .LBB2_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB2_135 +; 
GFX10W64-NEXT: .LBB2_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB2_138 +; GFX10W64-NEXT: .LBB2_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB2_141 +; GFX10W64-NEXT: .LBB2_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB2_144 +; GFX10W64-NEXT: .LBB2_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB2_147 +; GFX10W64-NEXT: .LBB2_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB2_150 +; GFX10W64-NEXT: .LBB2_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB2_153 +; GFX10W64-NEXT: .LBB2_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: 
s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX10W64-NEXT: ; %bb.154: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB2_156 +; GFX10W64-NEXT: .LBB2_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB2_159 +; GFX10W64-NEXT: .LBB2_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: s_branch .LBB2_162 +; GFX10W64-NEXT: .LBB2_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 
22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB2_165 +; GFX10W64-NEXT: .LBB2_164: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB2_168 +; GFX10W64-NEXT: .LBB2_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB2_171 +; GFX10W64-NEXT: .LBB2_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_173 +; 
GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB2_174 +; GFX10W64-NEXT: .LBB2_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB2_177 +; GFX10W64-NEXT: .LBB2_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_177: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB2_180 +; GFX10W64-NEXT: .LBB2_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX10W64-NEXT: ; %bb.181: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch 
.LBB2_183 +; GFX10W64-NEXT: .LBB2_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB2_186 +; GFX10W64-NEXT: .LBB2_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB2_189 +; GFX10W64-NEXT: .LBB2_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX10W64-NEXT: s_branch .LBB2_192 +; GFX10W64-NEXT: 
.LBB2_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB2_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB2_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB2_3 ; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_3: +; GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: 
s_cbranch_scc1 .LBB2_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX10W32-NEXT: s_branch .LBB2_6 +; GFX10W32-NEXT: .LBB2_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_6: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB2_9 +; GFX10W32-NEXT: .LBB2_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB2_12 +; GFX10W32-NEXT: .LBB2_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB2_15 +; GFX10W32-NEXT: .LBB2_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, 
exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB2_18 +; GFX10W32-NEXT: .LBB2_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB2_21 +; GFX10W32-NEXT: .LBB2_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: s_branch .LBB2_24 +; GFX10W32-NEXT: .LBB2_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 
s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch .LBB2_27 +; GFX10W32-NEXT: .LBB2_26: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB2_30 +; GFX10W32-NEXT: .LBB2_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB2_33 +; GFX10W32-NEXT: .LBB2_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB2_36 +; GFX10W32-NEXT: .LBB2_35: +; 
GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB2_39 +; GFX10W32-NEXT: .LBB2_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB2_42 +; GFX10W32-NEXT: .LBB2_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX10W32-NEXT: s_branch .LBB2_45 +; GFX10W32-NEXT: .LBB2_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, 
s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX10W32-NEXT: ; %bb.46: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB2_48 +; GFX10W32-NEXT: .LBB2_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB2_51 +; GFX10W32-NEXT: .LBB2_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB2_54 +; GFX10W32-NEXT: .LBB2_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_54: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX10W32-NEXT: ; 
%bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB2_57 +; GFX10W32-NEXT: .LBB2_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_57: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB2_60 +; GFX10W32-NEXT: .LBB2_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB2_63 +; GFX10W32-NEXT: .LBB2_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB2_66 +; GFX10W32-NEXT: .LBB2_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo 
+; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB2_69 +; GFX10W32-NEXT: .LBB2_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB2_72 +; GFX10W32-NEXT: .LBB2_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX10W32-NEXT: s_branch .LBB2_75 +; GFX10W32-NEXT: .LBB2_74: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; 
GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX10W32-NEXT: s_branch .LBB2_78 +; GFX10W32-NEXT: .LBB2_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB2_81 +; GFX10W32-NEXT: .LBB2_80: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB2_84 +; GFX10W32-NEXT: .LBB2_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch 
.LBB2_87 +; GFX10W32-NEXT: .LBB2_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB2_90 +; GFX10W32-NEXT: .LBB2_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB2_93 +; GFX10W32-NEXT: .LBB2_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB2_96 +; GFX10W32-NEXT: .LBB2_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; 
implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB2_98 +; GFX10W32-NEXT: ; %bb.97: +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: .LBB2_98: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 
0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB2_3 +; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB2_6 +; GFX11W64-NEXT: .LBB2_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX11W64-NEXT: ; %bb.7: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB2_9 +; GFX11W64-NEXT: .LBB2_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB2_12 +; GFX11W64-NEXT: .LBB2_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX11W64-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB2_15 +; GFX11W64-NEXT: .LBB2_14: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB2_18 +; GFX11W64-NEXT: .LBB2_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_18: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB2_21 +; GFX11W64-NEXT: .LBB2_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB2_24 +; GFX11W64-NEXT: .LBB2_23: +; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB2_27 +; GFX11W64-NEXT: .LBB2_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB2_30 +; GFX11W64-NEXT: .LBB2_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch .LBB2_33 +; GFX11W64-NEXT: .LBB2_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB2_36 +; GFX11W64-NEXT: .LBB2_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB2_39 +; GFX11W64-NEXT: .LBB2_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX11W64-NEXT: ; %bb.40: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB2_42 +; GFX11W64-NEXT: .LBB2_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; 
GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB2_45 +; GFX11W64-NEXT: .LBB2_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX11W64-NEXT: ; %bb.46: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB2_48 +; GFX11W64-NEXT: .LBB2_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB2_51 +; GFX11W64-NEXT: .LBB2_50: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB2_54 +; GFX11W64-NEXT: .LBB2_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB2_57 +; GFX11W64-NEXT: .LBB2_56: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB2_60 +; GFX11W64-NEXT: .LBB2_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; 
GFX11W64-NEXT: s_branch .LBB2_63 +; GFX11W64-NEXT: .LBB2_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_63: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB2_66 +; GFX11W64-NEXT: .LBB2_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB2_69 +; GFX11W64-NEXT: .LBB2_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB2_72 +; GFX11W64-NEXT: .LBB2_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_72: +; GFX11W64-NEXT: 
s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB2_75 +; GFX11W64-NEXT: .LBB2_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB2_78 +; GFX11W64-NEXT: .LBB2_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB2_81 +; GFX11W64-NEXT: .LBB2_80: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 
0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB2_84 +; GFX11W64-NEXT: .LBB2_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB2_87 +; GFX11W64-NEXT: .LBB2_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch .LBB2_90 +; GFX11W64-NEXT: .LBB2_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: 
s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB2_93 +; GFX11W64-NEXT: .LBB2_92: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB2_96 +; GFX11W64-NEXT: .LBB2_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB2_99 +; GFX11W64-NEXT: .LBB2_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 
+; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB2_102 +; GFX11W64-NEXT: .LBB2_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB2_105 +; GFX11W64-NEXT: .LBB2_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX11W64-NEXT: s_branch .LBB2_108 +; GFX11W64-NEXT: .LBB2_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; 
GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB2_111 +; GFX11W64-NEXT: .LBB2_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX11W64-NEXT: s_branch .LBB2_114 +; GFX11W64-NEXT: .LBB2_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB2_117 +; GFX11W64-NEXT: .LBB2_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 
.LBB2_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB2_120 +; GFX11W64-NEXT: .LBB2_119: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB2_123 +; GFX11W64-NEXT: .LBB2_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB2_126 +; GFX11W64-NEXT: .LBB2_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch 
.LBB2_129 +; GFX11W64-NEXT: .LBB2_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB2_132 +; GFX11W64-NEXT: .LBB2_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB2_135 +; GFX11W64-NEXT: .LBB2_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX11W64-NEXT: s_branch .LBB2_138 +; GFX11W64-NEXT: .LBB2_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], 
s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB2_141 +; GFX11W64-NEXT: .LBB2_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_141: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB2_144 +; GFX11W64-NEXT: .LBB2_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB2_147 +; GFX11W64-NEXT: .LBB2_146: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB2_150 +; GFX11W64-NEXT: .LBB2_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB2_153 +; GFX11W64-NEXT: .LBB2_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch .LBB2_156 +; GFX11W64-NEXT: .LBB2_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: 
s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB2_159 +; GFX11W64-NEXT: .LBB2_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: s_branch .LBB2_162 +; GFX11W64-NEXT: .LBB2_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB2_165 +; GFX11W64-NEXT: .LBB2_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: 
s_cbranch_scc1 .LBB2_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB2_168 +; GFX11W64-NEXT: .LBB2_167: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB2_171 +; GFX11W64-NEXT: .LBB2_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB2_174 +; GFX11W64-NEXT: .LBB2_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; 
GFX11W64-NEXT: s_branch .LBB2_177 +; GFX11W64-NEXT: .LBB2_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX11W64-NEXT: s_branch .LBB2_180 +; GFX11W64-NEXT: .LBB2_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB2_183 +; GFX11W64-NEXT: .LBB2_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB2_186 +; GFX11W64-NEXT: .LBB2_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_186: +; 
GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB2_189 +; GFX11W64-NEXT: .LBB2_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB2_192 +; GFX11W64-NEXT: .LBB2_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_cbranch_execz .LBB2_194 +; GFX11W64-NEXT: ; %bb.193: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; 
GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_194: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: 
v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB2_3 ; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 
.LBB2_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB2_6 +; GFX11W32-NEXT: .LBB2_5: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB2_9 +; GFX11W32-NEXT: .LBB2_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB2_12 +; GFX11W32-NEXT: .LBB2_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB2_15 +; GFX11W32-NEXT: .LBB2_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; 
GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX11W32-NEXT: ; %bb.16: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB2_18 +; GFX11W32-NEXT: .LBB2_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB2_21 +; GFX11W32-NEXT: .LBB2_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB2_24 +; GFX11W32-NEXT: .LBB2_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 
+; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: s_branch .LBB2_27 +; GFX11W32-NEXT: .LBB2_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB2_30 +; GFX11W32-NEXT: .LBB2_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_30: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB2_33 +; GFX11W32-NEXT: .LBB2_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB2_36 +; GFX11W32-NEXT: .LBB2_35: +; GFX11W32-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB2_39 +; GFX11W32-NEXT: .LBB2_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB2_42 +; GFX11W32-NEXT: .LBB2_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB2_45 +; GFX11W32-NEXT: .LBB2_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_45: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; 
GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB2_48 +; GFX11W32-NEXT: .LBB2_47: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX11W32-NEXT: s_branch .LBB2_51 +; GFX11W32-NEXT: .LBB2_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB2_54 +; GFX11W32-NEXT: .LBB2_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_54: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX11W32-NEXT: ; 
%bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB2_57 +; GFX11W32-NEXT: .LBB2_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_57: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB2_60 +; GFX11W32-NEXT: .LBB2_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB2_63 +; GFX11W32-NEXT: .LBB2_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB2_66 +; GFX11W32-NEXT: .LBB2_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo 
+; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX11W32-NEXT: ; %bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB2_69 +; GFX11W32-NEXT: .LBB2_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB2_72 +; GFX11W32-NEXT: .LBB2_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch .LBB2_75 +; GFX11W32-NEXT: .LBB2_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; 
GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX11W32-NEXT: s_branch .LBB2_78 +; GFX11W32-NEXT: .LBB2_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB2_81 +; GFX11W32-NEXT: .LBB2_80: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB2_84 +; GFX11W32-NEXT: .LBB2_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch 
.LBB2_87 +; GFX11W32-NEXT: .LBB2_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB2_90 +; GFX11W32-NEXT: .LBB2_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB2_93 +; GFX11W32-NEXT: .LBB2_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX11W32-NEXT: s_branch .LBB2_96 +; GFX11W32-NEXT: .LBB2_95: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; 
implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB2_98 +; GFX11W32-NEXT: ; %bb.97: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB2_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -782,328 +5495,5042 @@ ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB3_3 +; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB3_6 +; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB3_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB3_9 +; GFX8-NEXT: .LBB3_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB3_12 +; GFX8-NEXT: .LBB3_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB3_15 +; GFX8-NEXT: .LBB3_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB3_18 +; GFX8-NEXT: .LBB3_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB3_21 +; GFX8-NEXT: .LBB3_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB3_24 +; GFX8-NEXT: .LBB3_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB3_27 +; GFX8-NEXT: .LBB3_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB3_30 +; GFX8-NEXT: .LBB3_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB3_33 +; GFX8-NEXT: .LBB3_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB3_36 +; GFX8-NEXT: .LBB3_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB3_39 +; GFX8-NEXT: .LBB3_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; 
GFX8-NEXT: s_branch .LBB3_42 +; GFX8-NEXT: .LBB3_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB3_45 +; GFX8-NEXT: .LBB3_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB3_48 +; GFX8-NEXT: .LBB3_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB3_51 +; GFX8-NEXT: .LBB3_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB3_54 +; GFX8-NEXT: .LBB3_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB3_57 +; GFX8-NEXT: .LBB3_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB3_60 +; GFX8-NEXT: .LBB3_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB3_63 +; GFX8-NEXT: .LBB3_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB3_66 +; GFX8-NEXT: .LBB3_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB3_69 +; GFX8-NEXT: .LBB3_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB3_72 +; GFX8-NEXT: .LBB3_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch 
.LBB3_75 +; GFX8-NEXT: .LBB3_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB3_78 +; GFX8-NEXT: .LBB3_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB3_81 +; GFX8-NEXT: .LBB3_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB3_84 +; GFX8-NEXT: .LBB3_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB3_87 +; GFX8-NEXT: .LBB3_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB3_90 +; GFX8-NEXT: .LBB3_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB3_93 +; GFX8-NEXT: .LBB3_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB3_96 +; GFX8-NEXT: .LBB3_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: 
s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB3_99 +; GFX8-NEXT: .LBB3_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB3_102 +; GFX8-NEXT: .LBB3_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB3_105 +; GFX8-NEXT: .LBB3_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch 
.LBB3_108 +; GFX8-NEXT: .LBB3_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB3_111 +; GFX8-NEXT: .LBB3_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB3_114 +; GFX8-NEXT: .LBB3_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB3_117 +; GFX8-NEXT: .LBB3_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 
+; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB3_120 +; GFX8-NEXT: .LBB3_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB3_123 +; GFX8-NEXT: .LBB3_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB3_126 +; GFX8-NEXT: .LBB3_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB3_129 +; GFX8-NEXT: .LBB3_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: 
s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB3_132 +; GFX8-NEXT: .LBB3_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB3_135 +; GFX8-NEXT: .LBB3_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB3_138 +; GFX8-NEXT: .LBB3_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB3_141 
+; GFX8-NEXT: .LBB3_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB3_144 +; GFX8-NEXT: .LBB3_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB3_147 +; GFX8-NEXT: .LBB3_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB3_150 +; GFX8-NEXT: .LBB3_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB3_153 +; GFX8-NEXT: .LBB3_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB3_156 +; GFX8-NEXT: .LBB3_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB3_159 +; GFX8-NEXT: .LBB3_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB3_162 +; GFX8-NEXT: .LBB3_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 
0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB3_165 +; GFX8-NEXT: .LBB3_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB3_168 +; GFX8-NEXT: .LBB3_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB3_171 +; GFX8-NEXT: .LBB3_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; 
GFX8-NEXT: s_branch .LBB3_174 +; GFX8-NEXT: .LBB3_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB3_177 +; GFX8-NEXT: .LBB3_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB3_180 +; GFX8-NEXT: .LBB3_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB3_183 +; GFX8-NEXT: .LBB3_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB3_186 +; GFX8-NEXT: .LBB3_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB3_189 +; GFX8-NEXT: .LBB3_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB3_192 +; GFX8-NEXT: .LBB3_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB3_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB3_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX8-NEXT: s_cbranch_execz .LBB3_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_load_dword s12, s[0:1], 0x44 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB3_2: +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB3_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, 
v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB3_3 +; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB3_6 +; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB3_9 +; GFX9-NEXT: .LBB3_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_11 
+; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB3_12 +; GFX9-NEXT: .LBB3_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB3_15 +; GFX9-NEXT: .LBB3_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB3_18 +; GFX9-NEXT: .LBB3_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB3_21 +; GFX9-NEXT: .LBB3_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB3_24 +; GFX9-NEXT: .LBB3_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB3_27 +; GFX9-NEXT: .LBB3_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB3_30 +; GFX9-NEXT: .LBB3_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB3_33 +; GFX9-NEXT: .LBB3_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: 
s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB3_36 +; GFX9-NEXT: .LBB3_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB3_39 +; GFX9-NEXT: .LBB3_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB3_42 +; GFX9-NEXT: .LBB3_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; 
GFX9-NEXT: s_branch .LBB3_45 +; GFX9-NEXT: .LBB3_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB3_48 +; GFX9-NEXT: .LBB3_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB3_51 +; GFX9-NEXT: .LBB3_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB3_54 +; GFX9-NEXT: .LBB3_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB3_57 +; GFX9-NEXT: .LBB3_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB3_60 +; GFX9-NEXT: .LBB3_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB3_63 +; GFX9-NEXT: .LBB3_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB3_66 +; GFX9-NEXT: .LBB3_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB3_69 +; GFX9-NEXT: .LBB3_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB3_72 +; GFX9-NEXT: .LBB3_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB3_75 +; GFX9-NEXT: .LBB3_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch 
.LBB3_78 +; GFX9-NEXT: .LBB3_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB3_81 +; GFX9-NEXT: .LBB3_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB3_84 +; GFX9-NEXT: .LBB3_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB3_87 +; GFX9-NEXT: .LBB3_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB3_90 +; GFX9-NEXT: .LBB3_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB3_93 +; GFX9-NEXT: .LBB3_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB3_96 +; GFX9-NEXT: .LBB3_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB3_99 +; GFX9-NEXT: .LBB3_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 
+; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB3_102 +; GFX9-NEXT: .LBB3_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB3_105 +; GFX9-NEXT: .LBB3_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB3_108 +; GFX9-NEXT: .LBB3_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB3_111 +; 
GFX9-NEXT: .LBB3_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB3_114 +; GFX9-NEXT: .LBB3_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB3_117 +; GFX9-NEXT: .LBB3_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB3_120 +; GFX9-NEXT: .LBB3_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB3_123 +; GFX9-NEXT: .LBB3_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB3_126 +; GFX9-NEXT: .LBB3_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB3_129 +; GFX9-NEXT: .LBB3_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB3_132 +; GFX9-NEXT: .LBB3_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: 
s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB3_135 +; GFX9-NEXT: .LBB3_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB3_138 +; GFX9-NEXT: .LBB3_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB3_141 +; GFX9-NEXT: .LBB3_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB3_144 
+; GFX9-NEXT: .LBB3_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB3_147 +; GFX9-NEXT: .LBB3_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB3_150 +; GFX9-NEXT: .LBB3_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB3_153 +; GFX9-NEXT: .LBB3_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB3_156 +; GFX9-NEXT: .LBB3_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB3_159 +; GFX9-NEXT: .LBB3_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB3_162 +; GFX9-NEXT: .LBB3_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB3_165 +; GFX9-NEXT: .LBB3_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 
0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB3_168 +; GFX9-NEXT: .LBB3_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB3_171 +; GFX9-NEXT: .LBB3_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB3_174 +; GFX9-NEXT: .LBB3_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; 
GFX9-NEXT: s_branch .LBB3_177 +; GFX9-NEXT: .LBB3_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB3_180 +; GFX9-NEXT: .LBB3_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB3_183 +; GFX9-NEXT: .LBB3_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB3_186 +; GFX9-NEXT: .LBB3_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 
v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB3_189 +; GFX9-NEXT: .LBB3_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB3_192 +; GFX9-NEXT: .LBB3_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB3_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB3_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX9-NEXT: s_cbranch_execz .LBB3_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_load_dword s12, s[0:1], 0x44 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: buffer_atomic_add v0, v4, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB3_2: +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB3_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: 
v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 
s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB3_3 +; GFX10W64-NEXT: .LBB3_2: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB3_6 +; GFX10W64-NEXT: .LBB3_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_8 +; GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX10W64-NEXT: s_branch .LBB3_9 +; GFX10W64-NEXT: .LBB3_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: 
s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB3_12 +; GFX10W64-NEXT: .LBB3_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB3_15 +; GFX10W64-NEXT: .LBB3_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_17 +; GFX10W64-NEXT: ; %bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB3_18 +; GFX10W64-NEXT: .LBB3_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB3_21 +; GFX10W64-NEXT: .LBB3_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB3_24 +; GFX10W64-NEXT: .LBB3_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB3_27 +; GFX10W64-NEXT: .LBB3_26: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_29 +; GFX10W64-NEXT: ; %bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; 
GFX10W64-NEXT: s_branch .LBB3_30 +; GFX10W64-NEXT: .LBB3_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB3_33 +; GFX10W64-NEXT: .LBB3_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB3_36 +; GFX10W64-NEXT: .LBB3_35: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB3_39 +; GFX10W64-NEXT: .LBB3_38: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_39: 
+; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB3_42 +; GFX10W64-NEXT: .LBB3_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_42: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB3_45 +; GFX10W64-NEXT: .LBB3_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_47 +; GFX10W64-NEXT: ; %bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: s_branch .LBB3_48 +; GFX10W64-NEXT: .LBB3_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 
s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB3_51 +; GFX10W64-NEXT: .LBB3_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB3_54 +; GFX10W64-NEXT: .LBB3_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB3_57 +; GFX10W64-NEXT: .LBB3_56: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_57: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB3_60 +; GFX10W64-NEXT: .LBB3_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB3_63 +; GFX10W64-NEXT: .LBB3_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB3_66 +; GFX10W64-NEXT: .LBB3_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_68 +; 
GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB3_69 +; GFX10W64-NEXT: .LBB3_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_71 +; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB3_72 +; GFX10W64-NEXT: .LBB3_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX10W64-NEXT: s_branch .LBB3_75 +; GFX10W64-NEXT: .LBB3_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_75: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX10W64-NEXT: s_branch .LBB3_78 +; GFX10W64-NEXT: .LBB3_77: +; GFX10W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB3_81 +; GFX10W64-NEXT: .LBB3_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB3_84 +; GFX10W64-NEXT: .LBB3_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_84: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB3_87 +; GFX10W64-NEXT: .LBB3_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB3_90 +; GFX10W64-NEXT: .LBB3_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB3_93 +; GFX10W64-NEXT: .LBB3_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB3_96 +; GFX10W64-NEXT: .LBB3_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB3_99 +; GFX10W64-NEXT: .LBB3_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB3_102 +; GFX10W64-NEXT: .LBB3_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX10W64-NEXT: s_branch .LBB3_105 +; GFX10W64-NEXT: .LBB3_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_107 +; 
GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB3_108 +; GFX10W64-NEXT: .LBB3_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB3_111 +; GFX10W64-NEXT: .LBB3_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX10W64-NEXT: s_branch .LBB3_114 +; GFX10W64-NEXT: .LBB3_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_114: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB3_117 +; GFX10W64-NEXT: 
.LBB3_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB3_120 +; GFX10W64-NEXT: .LBB3_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB3_123 +; GFX10W64-NEXT: .LBB3_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_125 +; GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB3_126 +; GFX10W64-NEXT: .LBB3_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: 
s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB3_129 +; GFX10W64-NEXT: .LBB3_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB3_132 +; GFX10W64-NEXT: .LBB3_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB3_135 +; GFX10W64-NEXT: .LBB3_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; 
GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB3_138 +; GFX10W64-NEXT: .LBB3_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB3_141 +; GFX10W64-NEXT: .LBB3_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_143 +; GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB3_144 +; GFX10W64-NEXT: .LBB3_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB3_147 +; GFX10W64-NEXT: .LBB3_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB3_150 +; GFX10W64-NEXT: .LBB3_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB3_153 +; GFX10W64-NEXT: .LBB3_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_155 +; GFX10W64-NEXT: ; %bb.154: +; 
GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB3_156 +; GFX10W64-NEXT: .LBB3_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB3_159 +; GFX10W64-NEXT: .LBB3_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: s_branch .LBB3_162 +; GFX10W64-NEXT: .LBB3_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB3_165 +; GFX10W64-NEXT: .LBB3_164: 
+; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB3_168 +; GFX10W64-NEXT: .LBB3_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB3_171 +; GFX10W64-NEXT: .LBB3_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_173 +; GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB3_174 +; GFX10W64-NEXT: .LBB3_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: 
s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB3_177 +; GFX10W64-NEXT: .LBB3_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_177: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB3_180 +; GFX10W64-NEXT: .LBB3_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_182 +; GFX10W64-NEXT: ; %bb.181: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch .LBB3_183 +; GFX10W64-NEXT: .LBB3_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 
s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB3_186 +; GFX10W64-NEXT: .LBB3_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB3_189 +; GFX10W64-NEXT: .LBB3_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX10W64-NEXT: s_branch .LBB3_192 +; GFX10W64-NEXT: .LBB3_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB3_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: 
s_cbranch_execz .LBB3_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB3_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_clause 0x1 -; GFX10W64-NEXT: s_load_dword s5, s[0:1], 0x44 +; GFX10W64-NEXT: s_load_dword s12, s[0:1], 0x44 ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX10W64-NEXT: buffer_atomic_add v4, v5, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB3_2: +; GFX10W64-NEXT: v_mov_b32_e32 v2, s12 +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB3_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB3_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB3_3 +; GFX10W32-NEXT: .LBB3_2: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_3: +; GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX10W32-NEXT: s_branch .LBB3_6 +; GFX10W32-NEXT: .LBB3_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_6: +; GFX10W32-NEXT: 
s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB3_9 +; GFX10W32-NEXT: .LBB3_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB3_12 +; GFX10W32-NEXT: .LBB3_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB3_15 +; GFX10W32-NEXT: .LBB3_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB3_18 +; GFX10W32-NEXT: .LBB3_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB3_21 +; GFX10W32-NEXT: .LBB3_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: s_branch .LBB3_24 +; GFX10W32-NEXT: .LBB3_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_26 +; GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch .LBB3_27 +; GFX10W32-NEXT: .LBB3_26: +; 
GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB3_30 +; GFX10W32-NEXT: .LBB3_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB3_33 +; GFX10W32-NEXT: .LBB3_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB3_36 +; GFX10W32-NEXT: .LBB3_35: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; 
GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB3_39 +; GFX10W32-NEXT: .LBB3_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB3_42 +; GFX10W32-NEXT: .LBB3_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX10W32-NEXT: s_branch .LBB3_45 +; GFX10W32-NEXT: .LBB3_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_47 +; GFX10W32-NEXT: ; %bb.46: 
+; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB3_48 +; GFX10W32-NEXT: .LBB3_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB3_51 +; GFX10W32-NEXT: .LBB3_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB3_54 +; GFX10W32-NEXT: .LBB3_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_54: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_56 +; GFX10W32-NEXT: ; %bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB3_57 +; GFX10W32-NEXT: .LBB3_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_57: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; 
GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB3_60 +; GFX10W32-NEXT: .LBB3_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB3_63 +; GFX10W32-NEXT: .LBB3_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB3_66 +; GFX10W32-NEXT: .LBB3_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; 
GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB3_69 +; GFX10W32-NEXT: .LBB3_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB3_72 +; GFX10W32-NEXT: .LBB3_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX10W32-NEXT: s_branch .LBB3_75 +; GFX10W32-NEXT: .LBB3_74: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX10W32-NEXT: s_branch .LBB3_78 
+; GFX10W32-NEXT: .LBB3_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB3_81 +; GFX10W32-NEXT: .LBB3_80: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB3_84 +; GFX10W32-NEXT: .LBB3_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch .LBB3_87 +; GFX10W32-NEXT: .LBB3_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 
0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB3_90 +; GFX10W32-NEXT: .LBB3_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB3_93 +; GFX10W32-NEXT: .LBB3_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB3_96 +; GFX10W32-NEXT: .LBB3_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB3_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB3_98 +; GFX10W32-NEXT: ; %bb.97: ; GFX10W32-NEXT: s_clause 0x1 -; GFX10W32-NEXT: 
s_load_dword s8, s[0:1], 0x44 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX10W32-NEXT: s_load_dword s6, s[0:1], 0x44 +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: v_mov_b32_e32 v5, s8 -; GFX10W32-NEXT: buffer_atomic_add v4, v5, s[4:7], 0 idxen glc -; GFX10W32-NEXT: .LBB3_2: +; GFX10W32-NEXT: v_mov_b32_e32 v2, s6 +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W32-NEXT: .LBB3_98: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_2 +; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB3_3 +; GFX11W64-NEXT: .LBB3_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; 
GFX11W64-NEXT: .LBB3_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB3_6 +; GFX11W64-NEXT: .LBB3_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_8 +; GFX11W64-NEXT: ; %bb.7: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB3_9 +; GFX11W64-NEXT: .LBB3_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB3_12 +; GFX11W64-NEXT: .LBB3_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, 
exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB3_15 +; GFX11W64-NEXT: .LBB3_14: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB3_18 +; GFX11W64-NEXT: .LBB3_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_18: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB3_21 +; GFX11W64-NEXT: .LBB3_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX11W64-NEXT: s_cbranch_scc1 .LBB3_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB3_24 +; GFX11W64-NEXT: .LBB3_23: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB3_27 +; GFX11W64-NEXT: .LBB3_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB3_30 +; GFX11W64-NEXT: .LBB3_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch 
.LBB3_33 +; GFX11W64-NEXT: .LBB3_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB3_36 +; GFX11W64-NEXT: .LBB3_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB3_39 +; GFX11W64-NEXT: .LBB3_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_41 +; GFX11W64-NEXT: ; %bb.40: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB3_42 +; GFX11W64-NEXT: .LBB3_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB3_45 +; GFX11W64-NEXT: .LBB3_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_47 +; GFX11W64-NEXT: ; %bb.46: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB3_48 +; GFX11W64-NEXT: .LBB3_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB3_51 +; GFX11W64-NEXT: .LBB3_50: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 
+; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB3_54 +; GFX11W64-NEXT: .LBB3_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB3_57 +; GFX11W64-NEXT: .LBB3_56: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB3_60 +; GFX11W64-NEXT: .LBB3_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX11W64-NEXT: s_branch .LBB3_63 +; GFX11W64-NEXT: .LBB3_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_63: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB3_66 +; GFX11W64-NEXT: .LBB3_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB3_69 +; GFX11W64-NEXT: .LBB3_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: 
v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB3_72 +; GFX11W64-NEXT: .LBB3_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_72: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB3_75 +; GFX11W64-NEXT: .LBB3_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB3_78 +; GFX11W64-NEXT: .LBB3_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB3_81 +; GFX11W64-NEXT: .LBB3_80: +; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB3_84 +; GFX11W64-NEXT: .LBB3_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB3_87 +; GFX11W64-NEXT: .LBB3_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch .LBB3_90 +; GFX11W64-NEXT: .LBB3_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; 
GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_92 +; GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB3_93 +; GFX11W64-NEXT: .LBB3_92: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB3_96 +; GFX11W64-NEXT: .LBB3_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB3_99 +; 
GFX11W64-NEXT: .LBB3_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB3_102 +; GFX11W64-NEXT: .LBB3_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB3_105 +; GFX11W64-NEXT: .LBB3_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX11W64-NEXT: s_branch .LBB3_108 +; GFX11W64-NEXT: .LBB3_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: 
s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB3_111 +; GFX11W64-NEXT: .LBB3_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX11W64-NEXT: s_branch .LBB3_114 +; GFX11W64-NEXT: .LBB3_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB3_117 +; GFX11W64-NEXT: .LBB3_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB3_120 +; GFX11W64-NEXT: .LBB3_119: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB3_123 +; GFX11W64-NEXT: .LBB3_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB3_126 +; GFX11W64-NEXT: .LBB3_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch .LBB3_129 +; GFX11W64-NEXT: .LBB3_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB3_132 +; GFX11W64-NEXT: .LBB3_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB3_135 +; GFX11W64-NEXT: .LBB3_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, 
s6, 45 +; GFX11W64-NEXT: s_branch .LBB3_138 +; GFX11W64-NEXT: .LBB3_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB3_141 +; GFX11W64-NEXT: .LBB3_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_141: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB3_144 +; GFX11W64-NEXT: .LBB3_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB3_147 +; GFX11W64-NEXT: .LBB3_146: +; GFX11W64-NEXT: ; implicit-def: 
$vgpr1 +; GFX11W64-NEXT: .LBB3_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB3_150 +; GFX11W64-NEXT: .LBB3_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB3_153 +; GFX11W64-NEXT: .LBB3_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch .LBB3_156 +; GFX11W64-NEXT: .LBB3_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_158 +; GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB3_159 +; GFX11W64-NEXT: .LBB3_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: s_branch .LBB3_162 +; GFX11W64-NEXT: .LBB3_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB3_165 +; GFX11W64-NEXT: .LBB3_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, 
v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB3_168 +; GFX11W64-NEXT: .LBB3_167: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB3_171 +; GFX11W64-NEXT: .LBB3_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB3_174 +; GFX11W64-NEXT: .LBB3_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX11W64-NEXT: s_branch .LBB3_177 +; GFX11W64-NEXT: .LBB3_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX11W64-NEXT: s_branch .LBB3_180 +; GFX11W64-NEXT: .LBB3_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB3_183 +; GFX11W64-NEXT: .LBB3_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: 
v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB3_186 +; GFX11W64-NEXT: .LBB3_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_186: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB3_189 +; GFX11W64-NEXT: .LBB3_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB3_192 +; GFX11W64-NEXT: .LBB3_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB3_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB3_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_cbranch_execz .LBB3_194 +; GFX11W64-NEXT: ; %bb.193: ; GFX11W64-NEXT: s_clause 0x1 -; GFX11W64-NEXT: s_load_b32 s5, s[0:1], 0x44 +; GFX11W64-NEXT: s_load_b32 s12, 
s[0:1], 0x44 ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: v_mov_b32_e32 v5, s5 -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v5, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB3_2: +; GFX11W64-NEXT: v_mov_b32_e32 v2, s12 +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB3_194: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB3_2 +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB3_3 +; GFX11W32-NEXT: .LBB3_2: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; 
GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB3_6 +; GFX11W32-NEXT: .LBB3_5: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB3_9 +; GFX11W32-NEXT: .LBB3_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB3_12 +; GFX11W32-NEXT: .LBB3_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB3_15 
+; GFX11W32-NEXT: .LBB3_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_17 +; GFX11W32-NEXT: ; %bb.16: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB3_18 +; GFX11W32-NEXT: .LBB3_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB3_21 +; GFX11W32-NEXT: .LBB3_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB3_24 +; GFX11W32-NEXT: .LBB3_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: 
s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: s_branch .LBB3_27 +; GFX11W32-NEXT: .LBB3_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB3_30 +; GFX11W32-NEXT: .LBB3_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_30: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB3_33 +; GFX11W32-NEXT: .LBB3_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_35 +; 
GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB3_36 +; GFX11W32-NEXT: .LBB3_35: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB3_39 +; GFX11W32-NEXT: .LBB3_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB3_42 +; GFX11W32-NEXT: .LBB3_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_44 +; GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB3_45 +; GFX11W32-NEXT: .LBB3_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_45: +; GFX11W32-NEXT: s_and_b32 s4, 
s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB3_48 +; GFX11W32-NEXT: .LBB3_47: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX11W32-NEXT: s_branch .LBB3_51 +; GFX11W32-NEXT: .LBB3_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB3_54 +; GFX11W32-NEXT: .LBB3_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_54: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; 
GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_56 +; GFX11W32-NEXT: ; %bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB3_57 +; GFX11W32-NEXT: .LBB3_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_57: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB3_60 +; GFX11W32-NEXT: .LBB3_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB3_63 +; GFX11W32-NEXT: .LBB3_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB3_66 +; 
GFX11W32-NEXT: .LBB3_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_68 +; GFX11W32-NEXT: ; %bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB3_69 +; GFX11W32-NEXT: .LBB3_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB3_72 +; GFX11W32-NEXT: .LBB3_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch .LBB3_75 +; GFX11W32-NEXT: .LBB3_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 
0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX11W32-NEXT: s_branch .LBB3_78 +; GFX11W32-NEXT: .LBB3_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB3_81 +; GFX11W32-NEXT: .LBB3_80: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB3_84 +; GFX11W32-NEXT: .LBB3_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX11W32-NEXT: s_cbranch_scc1 .LBB3_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch .LBB3_87 +; GFX11W32-NEXT: .LBB3_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB3_90 +; GFX11W32-NEXT: .LBB3_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB3_93 +; GFX11W32-NEXT: .LBB3_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX11W32-NEXT: s_branch .LBB3_96 +; GFX11W32-NEXT: .LBB3_95: +; 
GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB3_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB3_98 +; GFX11W32-NEXT: ; %bb.97: ; GFX11W32-NEXT: s_clause 0x1 -; GFX11W32-NEXT: s_load_b32 s8, s[0:1], 0x44 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 +; GFX11W32-NEXT: s_load_b32 s6, s[0:1], 0x44 +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s8 -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v5, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB3_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: v_mov_b32_e32 v2, s6 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB3_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1629,313 +11056,5027 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: 
s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB7_3 +; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; 
GFX8-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB7_6 +; GFX8-NEXT: .LBB7_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB7_9 +; GFX8-NEXT: .LBB7_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB7_12 +; GFX8-NEXT: .LBB7_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB7_15 +; GFX8-NEXT: .LBB7_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 
exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB7_18 +; GFX8-NEXT: .LBB7_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB7_21 +; GFX8-NEXT: .LBB7_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB7_24 +; GFX8-NEXT: .LBB7_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB7_27 +; GFX8-NEXT: .LBB7_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec 
+; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB7_30 +; GFX8-NEXT: .LBB7_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB7_33 +; GFX8-NEXT: .LBB7_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB7_36 +; GFX8-NEXT: .LBB7_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; 
GFX8-NEXT: s_branch .LBB7_39 +; GFX8-NEXT: .LBB7_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB7_42 +; GFX8-NEXT: .LBB7_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB7_45 +; GFX8-NEXT: .LBB7_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB7_48 +; GFX8-NEXT: .LBB7_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB7_51 +; GFX8-NEXT: .LBB7_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB7_54 +; GFX8-NEXT: .LBB7_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB7_57 +; GFX8-NEXT: .LBB7_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB7_60 +; GFX8-NEXT: .LBB7_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB7_63 +; GFX8-NEXT: .LBB7_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB7_66 +; GFX8-NEXT: .LBB7_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB7_69 +; GFX8-NEXT: .LBB7_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch 
.LBB7_72 +; GFX8-NEXT: .LBB7_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB7_75 +; GFX8-NEXT: .LBB7_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB7_78 +; GFX8-NEXT: .LBB7_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB7_81 +; GFX8-NEXT: .LBB7_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB7_84 +; GFX8-NEXT: .LBB7_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB7_87 +; GFX8-NEXT: .LBB7_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB7_90 +; GFX8-NEXT: .LBB7_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB7_93 +; GFX8-NEXT: .LBB7_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: 
s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB7_96 +; GFX8-NEXT: .LBB7_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB7_99 +; GFX8-NEXT: .LBB7_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB7_102 +; GFX8-NEXT: .LBB7_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB7_105 
+; GFX8-NEXT: .LBB7_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB7_108 +; GFX8-NEXT: .LBB7_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB7_111 +; GFX8-NEXT: .LBB7_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB7_114 +; GFX8-NEXT: .LBB7_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB7_117 +; GFX8-NEXT: .LBB7_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB7_120 +; GFX8-NEXT: .LBB7_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB7_123 +; GFX8-NEXT: .LBB7_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB7_126 +; GFX8-NEXT: .LBB7_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 
+; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB7_129 +; GFX8-NEXT: .LBB7_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB7_132 +; GFX8-NEXT: .LBB7_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB7_135 +; GFX8-NEXT: .LBB7_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB7_138 +; GFX8-NEXT: 
.LBB7_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB7_141 +; GFX8-NEXT: .LBB7_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB7_144 +; GFX8-NEXT: .LBB7_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB7_147 +; GFX8-NEXT: .LBB7_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB7_150 +; GFX8-NEXT: .LBB7_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB7_153 +; GFX8-NEXT: .LBB7_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB7_156 +; GFX8-NEXT: .LBB7_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB7_159 +; GFX8-NEXT: .LBB7_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB7_162 +; GFX8-NEXT: .LBB7_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB7_165 +; GFX8-NEXT: .LBB7_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB7_168 +; GFX8-NEXT: .LBB7_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; 
GFX8-NEXT: s_branch .LBB7_171 +; GFX8-NEXT: .LBB7_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB7_174 +; GFX8-NEXT: .LBB7_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB7_177 +; GFX8-NEXT: .LBB7_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB7_180 +; GFX8-NEXT: .LBB7_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB7_183 +; GFX8-NEXT: .LBB7_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB7_186 +; GFX8-NEXT: .LBB7_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB7_189 +; GFX8-NEXT: .LBB7_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB7_192 +; GFX8-NEXT: .LBB7_191: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB7_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: .LBB7_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB7_3 +; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB7_6 +; GFX9-NEXT: .LBB7_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB7_9 +; GFX9-NEXT: .LBB7_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_9: +; GFX9-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB7_12 +; GFX9-NEXT: .LBB7_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB7_15 +; GFX9-NEXT: .LBB7_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB7_18 +; GFX9-NEXT: .LBB7_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; 
GFX9-NEXT: s_branch .LBB7_21 +; GFX9-NEXT: .LBB7_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB7_24 +; GFX9-NEXT: .LBB7_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB7_27 +; GFX9-NEXT: .LBB7_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB7_30 +; GFX9-NEXT: .LBB7_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB7_33 +; GFX9-NEXT: .LBB7_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB7_36 +; GFX9-NEXT: .LBB7_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB7_39 +; GFX9-NEXT: .LBB7_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB7_42 +; GFX9-NEXT: .LBB7_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 
s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB7_45 +; GFX9-NEXT: .LBB7_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB7_48 +; GFX9-NEXT: .LBB7_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB7_51 +; GFX9-NEXT: .LBB7_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB7_54 +; GFX9-NEXT: .LBB7_53: 
+; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB7_57 +; GFX9-NEXT: .LBB7_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB7_60 +; GFX9-NEXT: .LBB7_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB7_63 +; GFX9-NEXT: .LBB7_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 
s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB7_66 +; GFX9-NEXT: .LBB7_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB7_69 +; GFX9-NEXT: .LBB7_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB7_72 +; GFX9-NEXT: .LBB7_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB7_75 +; GFX9-NEXT: .LBB7_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: 
s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB7_78 +; GFX9-NEXT: .LBB7_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB7_81 +; GFX9-NEXT: .LBB7_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB7_84 +; GFX9-NEXT: .LBB7_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB7_87 +; GFX9-NEXT: .LBB7_86: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB7_90 +; GFX9-NEXT: .LBB7_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB7_93 +; GFX9-NEXT: .LBB7_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB7_96 +; GFX9-NEXT: .LBB7_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 
32 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB7_99 +; GFX9-NEXT: .LBB7_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB7_102 +; GFX9-NEXT: .LBB7_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB7_105 +; GFX9-NEXT: .LBB7_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB7_108 +; GFX9-NEXT: .LBB7_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 
16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB7_111 +; GFX9-NEXT: .LBB7_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB7_114 +; GFX9-NEXT: .LBB7_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB7_117 +; GFX9-NEXT: .LBB7_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB7_120 +; GFX9-NEXT: .LBB7_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB7_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB7_123 +; GFX9-NEXT: .LBB7_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB7_126 +; GFX9-NEXT: .LBB7_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB7_129 +; GFX9-NEXT: .LBB7_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 
.LBB7_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB7_132 +; GFX9-NEXT: .LBB7_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB7_135 +; GFX9-NEXT: .LBB7_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB7_138 +; GFX9-NEXT: .LBB7_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB7_141 +; GFX9-NEXT: .LBB7_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; 
GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB7_144 +; GFX9-NEXT: .LBB7_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB7_147 +; GFX9-NEXT: .LBB7_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB7_150 +; GFX9-NEXT: .LBB7_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB7_153 +; GFX9-NEXT: .LBB7_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; 
GFX9-NEXT: .LBB7_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB7_156 +; GFX9-NEXT: .LBB7_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB7_159 +; GFX9-NEXT: .LBB7_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB7_162 +; GFX9-NEXT: .LBB7_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB7_165 +; GFX9-NEXT: .LBB7_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB7_168 +; GFX9-NEXT: .LBB7_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB7_171 +; GFX9-NEXT: .LBB7_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB7_174 +; GFX9-NEXT: .LBB7_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: 
s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB7_177 +; GFX9-NEXT: .LBB7_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB7_180 +; GFX9-NEXT: .LBB7_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB7_183 +; GFX9-NEXT: .LBB7_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB7_186 +; GFX9-NEXT: 
.LBB7_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB7_189 +; GFX9-NEXT: .LBB7_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB7_192 +; GFX9-NEXT: .LBB7_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB7_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: .LBB7_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB7_3 +; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB7_6 +; GFX10W64-NEXT: .LBB7_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX10W64-NEXT: s_branch .LBB7_9 +; GFX10W64-NEXT: .LBB7_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 
0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB7_12 +; GFX10W64-NEXT: .LBB7_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB7_15 +; GFX10W64-NEXT: .LBB7_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX10W64-NEXT: ; %bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB7_18 +; GFX10W64-NEXT: .LBB7_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB7_21 +; GFX10W64-NEXT: .LBB7_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB7_24 +; GFX10W64-NEXT: .LBB7_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB7_27 +; GFX10W64-NEXT: .LBB7_26: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX10W64-NEXT: ; 
%bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX10W64-NEXT: s_branch .LBB7_30 +; GFX10W64-NEXT: .LBB7_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB7_33 +; GFX10W64-NEXT: .LBB7_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB7_36 +; GFX10W64-NEXT: .LBB7_35: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB7_39 +; GFX10W64-NEXT: .LBB7_38: +; 
GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_39: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB7_42 +; GFX10W64-NEXT: .LBB7_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_42: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB7_45 +; GFX10W64-NEXT: .LBB7_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX10W64-NEXT: ; %bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: s_branch .LBB7_48 +; GFX10W64-NEXT: .LBB7_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; 
GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB7_51 +; GFX10W64-NEXT: .LBB7_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB7_54 +; GFX10W64-NEXT: .LBB7_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB7_57 +; GFX10W64-NEXT: .LBB7_56: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_57: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 
19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB7_60 +; GFX10W64-NEXT: .LBB7_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB7_63 +; GFX10W64-NEXT: .LBB7_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB7_66 +; GFX10W64-NEXT: .LBB7_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB7_69 +; GFX10W64-NEXT: .LBB7_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB7_72 +; GFX10W64-NEXT: .LBB7_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX10W64-NEXT: s_branch .LBB7_75 +; GFX10W64-NEXT: .LBB7_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_75: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; 
GFX10W64-NEXT: s_branch .LBB7_78 +; GFX10W64-NEXT: .LBB7_77: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB7_81 +; GFX10W64-NEXT: .LBB7_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB7_84 +; GFX10W64-NEXT: .LBB7_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_84: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB7_87 +; GFX10W64-NEXT: .LBB7_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; 
GFX10W64-NEXT: .LBB7_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB7_90 +; GFX10W64-NEXT: .LBB7_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB7_93 +; GFX10W64-NEXT: .LBB7_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB7_96 +; GFX10W64-NEXT: .LBB7_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; 
GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB7_99 +; GFX10W64-NEXT: .LBB7_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB7_102 +; GFX10W64-NEXT: .LBB7_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX10W64-NEXT: s_branch .LBB7_105 +; GFX10W64-NEXT: .LBB7_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB7_108 +; GFX10W64-NEXT: .LBB7_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB7_111 +; GFX10W64-NEXT: .LBB7_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX10W64-NEXT: s_branch .LBB7_114 +; GFX10W64-NEXT: .LBB7_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_114: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: 
v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB7_117 +; GFX10W64-NEXT: .LBB7_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB7_120 +; GFX10W64-NEXT: .LBB7_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB7_123 +; GFX10W64-NEXT: .LBB7_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB7_126 +; GFX10W64-NEXT: .LBB7_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: 
.LBB7_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB7_129 +; GFX10W64-NEXT: .LBB7_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB7_132 +; GFX10W64-NEXT: .LBB7_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB7_135 +; GFX10W64-NEXT: .LBB7_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; 
GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB7_138 +; GFX10W64-NEXT: .LBB7_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB7_141 +; GFX10W64-NEXT: .LBB7_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB7_144 +; GFX10W64-NEXT: .LBB7_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: 
s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB7_147 +; GFX10W64-NEXT: .LBB7_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB7_150 +; GFX10W64-NEXT: .LBB7_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB7_153 +; GFX10W64-NEXT: .LBB7_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX10W64-NEXT: ; %bb.154: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB7_156 +; GFX10W64-NEXT: .LBB7_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB7_159 +; GFX10W64-NEXT: .LBB7_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: s_branch .LBB7_162 +; GFX10W64-NEXT: .LBB7_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 
v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB7_165 +; GFX10W64-NEXT: .LBB7_164: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB7_168 +; GFX10W64-NEXT: .LBB7_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB7_171 +; GFX10W64-NEXT: .LBB7_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB7_174 +; GFX10W64-NEXT: .LBB7_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: 
.LBB7_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB7_177 +; GFX10W64-NEXT: .LBB7_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_177: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB7_180 +; GFX10W64-NEXT: .LBB7_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX10W64-NEXT: ; %bb.181: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch .LBB7_183 +; GFX10W64-NEXT: .LBB7_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 
+; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB7_186 +; GFX10W64-NEXT: .LBB7_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB7_189 +; GFX10W64-NEXT: .LBB7_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX10W64-NEXT: s_branch .LBB7_192 +; GFX10W64-NEXT: .LBB7_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 
s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB7_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB7_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: 
v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB7_3 ; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_3: +; GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX10W32-NEXT: s_branch .LBB7_6 +; GFX10W32-NEXT: .LBB7_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; 
GFX10W32-NEXT: .LBB7_6: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB7_9 +; GFX10W32-NEXT: .LBB7_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB7_12 +; GFX10W32-NEXT: .LBB7_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB7_15 +; GFX10W32-NEXT: .LBB7_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: 
s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB7_18 +; GFX10W32-NEXT: .LBB7_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB7_21 +; GFX10W32-NEXT: .LBB7_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: s_branch .LBB7_24 +; GFX10W32-NEXT: .LBB7_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch 
.LBB7_27 +; GFX10W32-NEXT: .LBB7_26: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB7_30 +; GFX10W32-NEXT: .LBB7_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB7_33 +; GFX10W32-NEXT: .LBB7_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB7_36 +; GFX10W32-NEXT: .LBB7_35: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 
+; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB7_39 +; GFX10W32-NEXT: .LBB7_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB7_42 +; GFX10W32-NEXT: .LBB7_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX10W32-NEXT: s_branch .LBB7_45 +; GFX10W32-NEXT: .LBB7_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: 
s_cbranch_scc1 .LBB7_47 +; GFX10W32-NEXT: ; %bb.46: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB7_48 +; GFX10W32-NEXT: .LBB7_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB7_51 +; GFX10W32-NEXT: .LBB7_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB7_54 +; GFX10W32-NEXT: .LBB7_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_54: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX10W32-NEXT: ; %bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB7_57 +; GFX10W32-NEXT: .LBB7_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_57: +; 
GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB7_60 +; GFX10W32-NEXT: .LBB7_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB7_63 +; GFX10W32-NEXT: .LBB7_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB7_66 +; GFX10W32-NEXT: .LBB7_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; 
GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB7_69 +; GFX10W32-NEXT: .LBB7_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB7_72 +; GFX10W32-NEXT: .LBB7_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX10W32-NEXT: s_branch .LBB7_75 +; GFX10W32-NEXT: .LBB7_74: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, 
s2, 25 +; GFX10W32-NEXT: s_branch .LBB7_78 +; GFX10W32-NEXT: .LBB7_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB7_81 +; GFX10W32-NEXT: .LBB7_80: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB7_84 +; GFX10W32-NEXT: .LBB7_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch .LBB7_87 +; GFX10W32-NEXT: .LBB7_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 
+; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB7_90 +; GFX10W32-NEXT: .LBB7_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB7_93 +; GFX10W32-NEXT: .LBB7_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB7_96 +; GFX10W32-NEXT: .LBB7_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB7_98 +; GFX10W32-NEXT: ; %bb.97: +; 
GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: .LBB7_98: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB7_3 +; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 
.LBB7_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB7_6 +; GFX11W64-NEXT: .LBB7_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX11W64-NEXT: ; %bb.7: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB7_9 +; GFX11W64-NEXT: .LBB7_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB7_12 +; GFX11W64-NEXT: .LBB7_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB7_15 +; GFX11W64-NEXT: .LBB7_14: +; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB7_18 +; GFX11W64-NEXT: .LBB7_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_18: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB7_21 +; GFX11W64-NEXT: .LBB7_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB7_24 +; GFX11W64-NEXT: .LBB7_23: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB7_27 +; GFX11W64-NEXT: .LBB7_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB7_30 +; GFX11W64-NEXT: .LBB7_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch .LBB7_33 +; GFX11W64-NEXT: .LBB7_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: 
s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB7_36 +; GFX11W64-NEXT: .LBB7_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB7_39 +; GFX11W64-NEXT: .LBB7_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX11W64-NEXT: ; %bb.40: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB7_42 +; GFX11W64-NEXT: .LBB7_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX11W64-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB7_45 +; GFX11W64-NEXT: .LBB7_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX11W64-NEXT: ; %bb.46: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB7_48 +; GFX11W64-NEXT: .LBB7_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB7_51 +; GFX11W64-NEXT: .LBB7_50: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB7_54 +; 
GFX11W64-NEXT: .LBB7_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB7_57 +; GFX11W64-NEXT: .LBB7_56: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB7_60 +; GFX11W64-NEXT: .LBB7_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX11W64-NEXT: s_branch .LBB7_63 +; GFX11W64-NEXT: .LBB7_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_63: +; GFX11W64-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB7_66 +; GFX11W64-NEXT: .LBB7_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB7_69 +; GFX11W64-NEXT: .LBB7_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB7_72 +; GFX11W64-NEXT: .LBB7_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_72: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; 
GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB7_75 +; GFX11W64-NEXT: .LBB7_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB7_78 +; GFX11W64-NEXT: .LBB7_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB7_81 +; GFX11W64-NEXT: .LBB7_80: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB7_84 +; GFX11W64-NEXT: .LBB7_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB7_87 +; GFX11W64-NEXT: .LBB7_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch .LBB7_90 +; GFX11W64-NEXT: .LBB7_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_92 +; 
GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB7_93 +; GFX11W64-NEXT: .LBB7_92: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB7_96 +; GFX11W64-NEXT: .LBB7_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB7_99 +; GFX11W64-NEXT: .LBB7_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 
exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB7_102 +; GFX11W64-NEXT: .LBB7_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB7_105 +; GFX11W64-NEXT: .LBB7_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX11W64-NEXT: s_branch .LBB7_108 +; GFX11W64-NEXT: .LBB7_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 
.LBB7_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB7_111 +; GFX11W64-NEXT: .LBB7_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX11W64-NEXT: s_branch .LBB7_114 +; GFX11W64-NEXT: .LBB7_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB7_117 +; GFX11W64-NEXT: .LBB7_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB7_120 +; GFX11W64-NEXT: .LBB7_119: +; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB7_123 +; GFX11W64-NEXT: .LBB7_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB7_126 +; GFX11W64-NEXT: .LBB7_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch .LBB7_129 +; GFX11W64-NEXT: .LBB7_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB7_132 +; GFX11W64-NEXT: .LBB7_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB7_135 +; GFX11W64-NEXT: .LBB7_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX11W64-NEXT: s_branch .LBB7_138 +; GFX11W64-NEXT: .LBB7_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: 
s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB7_141 +; GFX11W64-NEXT: .LBB7_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_141: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB7_144 +; GFX11W64-NEXT: .LBB7_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB7_147 +; GFX11W64-NEXT: .LBB7_146: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB7_150 +; GFX11W64-NEXT: .LBB7_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB7_153 +; GFX11W64-NEXT: .LBB7_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch .LBB7_156 +; GFX11W64-NEXT: .LBB7_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_158 +; 
GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB7_159 +; GFX11W64-NEXT: .LBB7_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: s_branch .LBB7_162 +; GFX11W64-NEXT: .LBB7_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB7_165 +; GFX11W64-NEXT: .LBB7_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB7_168 +; GFX11W64-NEXT: .LBB7_167: +; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB7_171 +; GFX11W64-NEXT: .LBB7_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB7_174 +; GFX11W64-NEXT: .LBB7_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX11W64-NEXT: s_branch .LBB7_177 +; GFX11W64-NEXT: .LBB7_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX11W64-NEXT: s_branch .LBB7_180 +; GFX11W64-NEXT: .LBB7_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB7_183 +; GFX11W64-NEXT: .LBB7_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB7_186 +; GFX11W64-NEXT: .LBB7_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_186: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 
s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB7_189 +; GFX11W64-NEXT: .LBB7_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB7_192 +; GFX11W64-NEXT: .LBB7_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_cbranch_execz .LBB7_194 +; GFX11W64-NEXT: ; %bb.193: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, 
s[8:11], 0 glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB7_194: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB7_3 ; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB7_6 +; GFX11W32-NEXT: .LBB7_5: +; GFX11W32-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB7_9 +; GFX11W32-NEXT: .LBB7_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB7_12 +; GFX11W32-NEXT: .LBB7_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB7_15 +; GFX11W32-NEXT: .LBB7_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; 
GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX11W32-NEXT: ; %bb.16: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB7_18 +; GFX11W32-NEXT: .LBB7_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB7_21 +; GFX11W32-NEXT: .LBB7_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB7_24 +; GFX11W32-NEXT: .LBB7_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: 
s_branch .LBB7_27 +; GFX11W32-NEXT: .LBB7_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB7_30 +; GFX11W32-NEXT: .LBB7_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_30: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB7_33 +; GFX11W32-NEXT: .LBB7_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB7_36 +; GFX11W32-NEXT: .LBB7_35: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 
0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB7_39 +; GFX11W32-NEXT: .LBB7_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB7_42 +; GFX11W32-NEXT: .LBB7_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB7_45 +; GFX11W32-NEXT: .LBB7_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_45: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: 
s_cbranch_scc1 .LBB7_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB7_48 +; GFX11W32-NEXT: .LBB7_47: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX11W32-NEXT: s_branch .LBB7_51 +; GFX11W32-NEXT: .LBB7_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB7_54 +; GFX11W32-NEXT: .LBB7_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_54: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX11W32-NEXT: ; %bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB7_57 +; GFX11W32-NEXT: .LBB7_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_57: +; 
GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB7_60 +; GFX11W32-NEXT: .LBB7_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB7_63 +; GFX11W32-NEXT: .LBB7_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB7_66 +; GFX11W32-NEXT: .LBB7_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; 
GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX11W32-NEXT: ; %bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB7_69 +; GFX11W32-NEXT: .LBB7_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB7_72 +; GFX11W32-NEXT: .LBB7_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch .LBB7_75 +; GFX11W32-NEXT: .LBB7_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, 
s2, 25 +; GFX11W32-NEXT: s_branch .LBB7_78 +; GFX11W32-NEXT: .LBB7_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB7_81 +; GFX11W32-NEXT: .LBB7_80: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB7_84 +; GFX11W32-NEXT: .LBB7_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch .LBB7_87 +; GFX11W32-NEXT: .LBB7_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 
+; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB7_90 +; GFX11W32-NEXT: .LBB7_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB7_93 +; GFX11W32-NEXT: .LBB7_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX11W32-NEXT: s_branch .LBB7_96 +; GFX11W32-NEXT: .LBB7_95: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: 
s_cbranch_execz .LBB7_98 +; GFX11W32-NEXT: ; %bb.97: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB7_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -516,368 +516,5067 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB2_3 +; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB2_6 +; GFX8-NEXT: .LBB2_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB2_9 +; GFX8-NEXT: .LBB2_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB2_12 +; GFX8-NEXT: .LBB2_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB2_15 +; GFX8-NEXT: .LBB2_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB2_18 +; GFX8-NEXT: .LBB2_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB2_21 +; GFX8-NEXT: .LBB2_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB2_24 +; GFX8-NEXT: .LBB2_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB2_27 +; GFX8-NEXT: .LBB2_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB2_30 +; GFX8-NEXT: .LBB2_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB2_33 +; GFX8-NEXT: .LBB2_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB2_36 +; GFX8-NEXT: .LBB2_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB2_39 +; GFX8-NEXT: .LBB2_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB2_42 +; GFX8-NEXT: .LBB2_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB2_45 +; GFX8-NEXT: .LBB2_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB2_48 +; 
GFX8-NEXT: .LBB2_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB2_51 +; GFX8-NEXT: .LBB2_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB2_54 +; GFX8-NEXT: .LBB2_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB2_57 +; GFX8-NEXT: .LBB2_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB2_60 +; GFX8-NEXT: .LBB2_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB2_63 +; GFX8-NEXT: .LBB2_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB2_66 +; GFX8-NEXT: .LBB2_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB2_69 +; GFX8-NEXT: .LBB2_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, 
s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB2_72 +; GFX8-NEXT: .LBB2_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB2_75 +; GFX8-NEXT: .LBB2_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB2_78 +; GFX8-NEXT: .LBB2_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB2_81 +; GFX8-NEXT: 
.LBB2_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB2_84 +; GFX8-NEXT: .LBB2_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB2_87 +; GFX8-NEXT: .LBB2_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB2_90 +; GFX8-NEXT: .LBB2_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB2_93 +; GFX8-NEXT: .LBB2_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB2_96 +; GFX8-NEXT: .LBB2_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB2_99 +; GFX8-NEXT: .LBB2_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB2_102 +; GFX8-NEXT: .LBB2_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, 
s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB2_105 +; GFX8-NEXT: .LBB2_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB2_108 +; GFX8-NEXT: .LBB2_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB2_111 +; GFX8-NEXT: .LBB2_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB2_114 +; GFX8-NEXT: .LBB2_113: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB2_117 +; GFX8-NEXT: .LBB2_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB2_120 +; GFX8-NEXT: .LBB2_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB2_123 +; GFX8-NEXT: .LBB2_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, 
v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB2_126 +; GFX8-NEXT: .LBB2_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB2_129 +; GFX8-NEXT: .LBB2_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB2_132 +; GFX8-NEXT: .LBB2_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB2_135 +; GFX8-NEXT: .LBB2_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: 
s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB2_138 +; GFX8-NEXT: .LBB2_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB2_141 +; GFX8-NEXT: .LBB2_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB2_144 +; GFX8-NEXT: .LBB2_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB2_147 +; GFX8-NEXT: .LBB2_146: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB2_150 +; GFX8-NEXT: .LBB2_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB2_153 +; GFX8-NEXT: .LBB2_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB2_156 +; GFX8-NEXT: .LBB2_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB2_159 +; GFX8-NEXT: .LBB2_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB2_162 +; GFX8-NEXT: .LBB2_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB2_165 +; GFX8-NEXT: .LBB2_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB2_168 +; GFX8-NEXT: .LBB2_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: 
s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB2_171 +; GFX8-NEXT: .LBB2_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB2_174 +; GFX8-NEXT: .LBB2_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB2_177 +; GFX8-NEXT: .LBB2_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: 
s_branch .LBB2_180 +; GFX8-NEXT: .LBB2_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB2_183 +; GFX8-NEXT: .LBB2_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB2_186 +; GFX8-NEXT: .LBB2_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s8, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s8, 62 +; GFX8-NEXT: s_branch .LBB2_189 +; GFX8-NEXT: .LBB2_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s8, s8, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 
31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s9, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s8, 63 +; GFX8-NEXT: s_branch .LBB2_192 +; GFX8-NEXT: .LBB2_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_192: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: 
s_cbranch_execz .LBB2_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s9, 0 +; GFX8-NEXT: s_add_i32 s4, s8, s4 ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB2_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB2_194: +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB2_3 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB2_6 +; GFX9-NEXT: .LBB2_5: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB2_9 +; GFX9-NEXT: .LBB2_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB2_12 +; GFX9-NEXT: .LBB2_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB2_15 +; GFX9-NEXT: .LBB2_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 
.LBB2_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB2_18 +; GFX9-NEXT: .LBB2_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB2_21 +; GFX9-NEXT: .LBB2_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB2_24 +; GFX9-NEXT: .LBB2_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB2_27 +; GFX9-NEXT: .LBB2_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB2_30 +; GFX9-NEXT: .LBB2_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB2_33 +; GFX9-NEXT: .LBB2_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB2_36 +; GFX9-NEXT: .LBB2_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB2_39 +; GFX9-NEXT: .LBB2_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 
+; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB2_42 +; GFX9-NEXT: .LBB2_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB2_45 +; GFX9-NEXT: .LBB2_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB2_48 +; GFX9-NEXT: .LBB2_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 
16 +; GFX9-NEXT: s_branch .LBB2_51 +; GFX9-NEXT: .LBB2_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB2_54 +; GFX9-NEXT: .LBB2_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB2_57 +; GFX9-NEXT: .LBB2_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB2_60 +; GFX9-NEXT: .LBB2_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB2_63 +; GFX9-NEXT: .LBB2_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB2_66 +; GFX9-NEXT: .LBB2_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB2_69 +; GFX9-NEXT: .LBB2_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB2_72 +; GFX9-NEXT: .LBB2_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, 
s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB2_75 +; GFX9-NEXT: .LBB2_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB2_78 +; GFX9-NEXT: .LBB2_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB2_81 +; GFX9-NEXT: .LBB2_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: 
s_branch .LBB2_84 +; GFX9-NEXT: .LBB2_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB2_87 +; GFX9-NEXT: .LBB2_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB2_90 +; GFX9-NEXT: .LBB2_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB2_93 +; GFX9-NEXT: .LBB2_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 
+; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB2_96 +; GFX9-NEXT: .LBB2_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB2_99 +; GFX9-NEXT: .LBB2_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB2_102 +; GFX9-NEXT: .LBB2_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB2_105 +; GFX9-NEXT: .LBB2_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB2_108 +; GFX9-NEXT: .LBB2_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB2_111 +; GFX9-NEXT: .LBB2_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB2_114 +; GFX9-NEXT: .LBB2_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB2_117 +; 
GFX9-NEXT: .LBB2_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB2_120 +; GFX9-NEXT: .LBB2_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB2_123 +; GFX9-NEXT: .LBB2_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB2_126 +; GFX9-NEXT: .LBB2_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB2_129 +; GFX9-NEXT: .LBB2_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB2_132 +; GFX9-NEXT: .LBB2_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB2_135 +; GFX9-NEXT: .LBB2_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB2_138 +; GFX9-NEXT: .LBB2_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: 
s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB2_141 +; GFX9-NEXT: .LBB2_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB2_144 +; GFX9-NEXT: .LBB2_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB2_147 +; GFX9-NEXT: .LBB2_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch 
.LBB2_150 +; GFX9-NEXT: .LBB2_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB2_153 +; GFX9-NEXT: .LBB2_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB2_156 +; GFX9-NEXT: .LBB2_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB2_159 +; GFX9-NEXT: .LBB2_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB2_162 +; GFX9-NEXT: .LBB2_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB2_165 +; GFX9-NEXT: .LBB2_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB2_168 +; GFX9-NEXT: .LBB2_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB2_171 +; GFX9-NEXT: .LBB2_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB2_174 +; GFX9-NEXT: .LBB2_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB2_177 +; GFX9-NEXT: .LBB2_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB2_180 +; GFX9-NEXT: .LBB2_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB2_183 +; GFX9-NEXT: .LBB2_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB2_186 +; GFX9-NEXT: .LBB2_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s8, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s8, 62 +; GFX9-NEXT: s_branch .LBB2_189 +; GFX9-NEXT: .LBB2_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s9, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s8, 63 +; GFX9-NEXT: s_branch .LBB2_192 +; GFX9-NEXT: .LBB2_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_192: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s9, 0 +; GFX9-NEXT: s_add_i32 s4, s8, s4 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB2_194: +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: 
v_writelane_b32 v1, 0, 0 +; GFX1064-NEXT: s_branch .LBB2_3 +; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB2_6 +; GFX1064-NEXT: .LBB2_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB2_9 +; GFX1064-NEXT: .LBB2_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB2_12 +; GFX1064-NEXT: .LBB2_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 
4 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB2_15 +; GFX1064-NEXT: .LBB2_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB2_18 +; GFX1064-NEXT: .LBB2_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB2_21 +; GFX1064-NEXT: .LBB2_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB2_24 +; GFX1064-NEXT: .LBB2_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB2_27 +; GFX1064-NEXT: .LBB2_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB2_30 +; GFX1064-NEXT: .LBB2_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB2_33 +; 
GFX1064-NEXT: .LBB2_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB2_36 +; GFX1064-NEXT: .LBB2_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB2_39 +; GFX1064-NEXT: .LBB2_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB2_42 +; GFX1064-NEXT: .LBB2_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: 
v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB2_45 +; GFX1064-NEXT: .LBB2_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB2_48 +; GFX1064-NEXT: .LBB2_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB2_51 +; GFX1064-NEXT: .LBB2_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB2_54 +; GFX1064-NEXT: .LBB2_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB2_57 +; GFX1064-NEXT: .LBB2_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB2_60 +; GFX1064-NEXT: .LBB2_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 
v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB2_63 +; GFX1064-NEXT: .LBB2_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB2_66 +; GFX1064-NEXT: .LBB2_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB2_69 +; GFX1064-NEXT: .LBB2_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB2_72 +; GFX1064-NEXT: .LBB2_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB2_75 +; GFX1064-NEXT: .LBB2_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB2_78 +; GFX1064-NEXT: .LBB2_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB2_81 +; GFX1064-NEXT: .LBB2_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 
27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB2_84 +; GFX1064-NEXT: .LBB2_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB2_87 +; GFX1064-NEXT: .LBB2_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB2_90 +; GFX1064-NEXT: .LBB2_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_add_i32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 
.LBB2_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB2_93 +; GFX1064-NEXT: .LBB2_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: s_add_i32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1064-NEXT: s_branch .LBB2_96 +; GFX1064-NEXT: .LBB2_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB2_99 +; GFX1064-NEXT: .LBB2_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB2_102 +; GFX1064-NEXT: .LBB2_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB2_105 +; GFX1064-NEXT: .LBB2_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB2_108 +; GFX1064-NEXT: .LBB2_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB2_111 +; GFX1064-NEXT: .LBB2_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB2_114 +; GFX1064-NEXT: .LBB2_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB2_117 +; GFX1064-NEXT: .LBB2_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB2_120 +; GFX1064-NEXT: .LBB2_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_122 +; 
GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB2_123 +; GFX1064-NEXT: .LBB2_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB2_126 +; GFX1064-NEXT: .LBB2_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB2_129 +; GFX1064-NEXT: .LBB2_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB2_132 +; GFX1064-NEXT: .LBB2_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: 
.LBB2_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB2_135 +; GFX1064-NEXT: .LBB2_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB2_138 +; GFX1064-NEXT: .LBB2_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB2_141 +; GFX1064-NEXT: .LBB2_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; 
GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB2_144 +; GFX1064-NEXT: .LBB2_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB2_147 +; GFX1064-NEXT: .LBB2_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB2_150 +; GFX1064-NEXT: .LBB2_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB2_153 +; GFX1064-NEXT: .LBB2_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB2_156 +; GFX1064-NEXT: .LBB2_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB2_159 +; GFX1064-NEXT: .LBB2_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: 
s_branch .LBB2_162 +; GFX1064-NEXT: .LBB2_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB2_165 +; GFX1064-NEXT: .LBB2_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB2_168 +; GFX1064-NEXT: .LBB2_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB2_171 +; GFX1064-NEXT: .LBB2_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_171: +; GFX1064-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB2_174 +; GFX1064-NEXT: .LBB2_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB2_177 +; GFX1064-NEXT: .LBB2_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB2_180 +; GFX1064-NEXT: .LBB2_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 
0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB2_183 +; GFX1064-NEXT: .LBB2_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_add_i32 s8, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s8, 61 +; GFX1064-NEXT: s_branch .LBB2_186 +; GFX1064-NEXT: .LBB2_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_add_i32 s8, s8, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s6, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s8, 62 +; GFX1064-NEXT: s_branch .LBB2_189 +; GFX1064-NEXT: .LBB2_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_add_i32 s8, s8, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s9, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 
31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s8, 63 +; GFX1064-NEXT: s_branch .LBB2_192 +; GFX1064-NEXT: .LBB2_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_192: ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_readfirstlane_b32 s6, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s9, 0 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_add_i32 s4, s8, s4 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB2_3 +; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: ; 
implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB2_6 +; GFX1032-NEXT: .LBB2_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB2_9 +; GFX1032-NEXT: .LBB2_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB2_12 +; GFX1032-NEXT: .LBB2_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB2_15 +; GFX1032-NEXT: .LBB2_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB2_18 +; GFX1032-NEXT: .LBB2_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB2_21 +; GFX1032-NEXT: .LBB2_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB2_24 +; GFX1032-NEXT: .LBB2_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_24: +; GFX1032-NEXT: s_and_b32 
s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB2_27 +; GFX1032-NEXT: .LBB2_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB2_30 +; GFX1032-NEXT: .LBB2_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB2_33 +; GFX1032-NEXT: .LBB2_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB2_36 +; GFX1032-NEXT: .LBB2_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB2_39 +; GFX1032-NEXT: .LBB2_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB2_42 +; GFX1032-NEXT: .LBB2_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB2_45 +; GFX1032-NEXT: .LBB2_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_45: +; GFX1032-NEXT: 
s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB2_48 +; GFX1032-NEXT: .LBB2_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB2_51 +; GFX1032-NEXT: .LBB2_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB2_54 +; GFX1032-NEXT: .LBB2_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 
0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB2_57 +; GFX1032-NEXT: .LBB2_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB2_60 +; GFX1032-NEXT: .LBB2_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB2_63 +; GFX1032-NEXT: .LBB2_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB2_66 +; GFX1032-NEXT: .LBB2_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: 
.LBB2_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB2_69 +; GFX1032-NEXT: .LBB2_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB2_72 +; GFX1032-NEXT: .LBB2_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB2_75 +; GFX1032-NEXT: .LBB2_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; 
GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB2_78 +; GFX1032-NEXT: .LBB2_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB2_81 +; GFX1032-NEXT: .LBB2_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB2_84 +; GFX1032-NEXT: .LBB2_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB2_87 +; GFX1032-NEXT: .LBB2_86: +; GFX1032-NEXT: ; 
implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB2_90 +; GFX1032-NEXT: .LBB2_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_add_i32 s4, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1032-NEXT: s_branch .LBB2_93 +; GFX1032-NEXT: .LBB2_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_93: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_add_i32 s4, s4, s2 +; GFX1032-NEXT: s_and_b32 s2, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1032-NEXT: s_branch .LBB2_96 +; GFX1032-NEXT: .LBB2_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_96: ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_readfirstlane_b32 s7, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s7, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1032-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_add_i32 s4, s4, s5 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 
0x31016000 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] 
row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB2_3 +; GFX1164-NEXT: .LBB2_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB2_6 +; GFX1164-NEXT: .LBB2_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB2_9 +; 
GFX1164-NEXT: .LBB2_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB2_12 +; GFX1164-NEXT: .LBB2_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB2_15 +; GFX1164-NEXT: .LBB2_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB2_18 +; GFX1164-NEXT: .LBB2_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; 
GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB2_21 +; GFX1164-NEXT: .LBB2_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB2_24 +; GFX1164-NEXT: .LBB2_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB2_27 +; GFX1164-NEXT: .LBB2_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB2_30 +; GFX1164-NEXT: .LBB2_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB2_33 +; GFX1164-NEXT: .LBB2_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB2_36 +; GFX1164-NEXT: .LBB2_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB2_39 +; 
GFX1164-NEXT: .LBB2_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB2_42 +; GFX1164-NEXT: .LBB2_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB2_45 +; GFX1164-NEXT: .LBB2_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB2_48 +; GFX1164-NEXT: .LBB2_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: 
v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB2_51 +; GFX1164-NEXT: .LBB2_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB2_54 +; GFX1164-NEXT: .LBB2_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB2_57 +; GFX1164-NEXT: .LBB2_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB2_60 +; GFX1164-NEXT: .LBB2_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB2_63 +; GFX1164-NEXT: .LBB2_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB2_66 +; GFX1164-NEXT: .LBB2_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 
v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB2_69 +; GFX1164-NEXT: .LBB2_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB2_72 +; GFX1164-NEXT: .LBB2_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB2_75 +; GFX1164-NEXT: .LBB2_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB2_78 +; GFX1164-NEXT: .LBB2_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB2_81 +; GFX1164-NEXT: .LBB2_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB2_84 +; GFX1164-NEXT: .LBB2_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB2_87 +; GFX1164-NEXT: .LBB2_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, 
v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB2_90 +; GFX1164-NEXT: .LBB2_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_add_i32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB2_93 +; GFX1164-NEXT: .LBB2_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_add_i32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1164-NEXT: s_branch .LBB2_96 +; GFX1164-NEXT: .LBB2_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1164-NEXT: v_readlane_b32 
s7, v0, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB2_99 +; GFX1164-NEXT: .LBB2_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB2_102 +; GFX1164-NEXT: .LBB2_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB2_105 +; GFX1164-NEXT: .LBB2_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB2_108 +; GFX1164-NEXT: .LBB2_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB2_111 +; GFX1164-NEXT: .LBB2_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB2_114 +; GFX1164-NEXT: .LBB2_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: 
s_branch .LBB2_117 +; GFX1164-NEXT: .LBB2_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB2_120 +; GFX1164-NEXT: .LBB2_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB2_123 +; GFX1164-NEXT: .LBB2_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB2_126 +; GFX1164-NEXT: .LBB2_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, 
s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB2_129 +; GFX1164-NEXT: .LBB2_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB2_132 +; GFX1164-NEXT: .LBB2_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB2_135 +; GFX1164-NEXT: .LBB2_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: 
s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB2_138 +; GFX1164-NEXT: .LBB2_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB2_141 +; GFX1164-NEXT: .LBB2_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB2_144 +; GFX1164-NEXT: .LBB2_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_146 +; 
GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB2_147 +; GFX1164-NEXT: .LBB2_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB2_150 +; GFX1164-NEXT: .LBB2_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB2_153 +; GFX1164-NEXT: .LBB2_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB2_156 +; GFX1164-NEXT: .LBB2_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: 
.LBB2_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB2_159 +; GFX1164-NEXT: .LBB2_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB2_162 +; GFX1164-NEXT: .LBB2_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB2_165 +; GFX1164-NEXT: .LBB2_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; 
GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB2_168 +; GFX1164-NEXT: .LBB2_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB2_171 +; GFX1164-NEXT: .LBB2_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB2_174 +; GFX1164-NEXT: .LBB2_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB2_177 +; GFX1164-NEXT: .LBB2_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB2_180 +; GFX1164-NEXT: .LBB2_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB2_183 +; GFX1164-NEXT: .LBB2_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_add_i32 s8, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s6, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s8, 61 +; 
GFX1164-NEXT: s_branch .LBB2_186 +; GFX1164-NEXT: .LBB2_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_add_i32 s8, s8, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s6, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s8, 62 +; GFX1164-NEXT: s_branch .LBB2_189 +; GFX1164-NEXT: .LBB2_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_add_i32 s8, s8, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s9, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s8, 63 +; GFX1164-NEXT: s_branch .LBB2_192 +; GFX1164-NEXT: .LBB2_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_192: +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s6, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB2_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s9, 0 
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_add_i32 s4, s8, s4 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB2_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB2_3 +; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB2_6 +; GFX1132-NEXT: .LBB2_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB2_9 +; GFX1132-NEXT: .LBB2_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, 
exec_lo, 8 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB2_12 +; GFX1132-NEXT: .LBB2_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB2_15 +; GFX1132-NEXT: .LBB2_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB2_18 +; GFX1132-NEXT: .LBB2_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: 
v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB2_21 +; GFX1132-NEXT: .LBB2_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB2_24 +; GFX1132-NEXT: .LBB2_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB2_27 +; GFX1132-NEXT: .LBB2_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB2_30 +; GFX1132-NEXT: .LBB2_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; 
GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB2_33 +; GFX1132-NEXT: .LBB2_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB2_36 +; GFX1132-NEXT: .LBB2_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB2_39 +; GFX1132-NEXT: .LBB2_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1132-NEXT: ; %bb.40: +; 
GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB2_42 +; GFX1132-NEXT: .LBB2_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB2_45 +; GFX1132-NEXT: .LBB2_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB2_48 +; GFX1132-NEXT: .LBB2_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB2_51 +; GFX1132-NEXT: .LBB2_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: 
s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB2_54 +; GFX1132-NEXT: .LBB2_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB2_57 +; GFX1132-NEXT: .LBB2_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB2_60 +; GFX1132-NEXT: .LBB2_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_62 +; 
GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB2_63 +; GFX1132-NEXT: .LBB2_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB2_66 +; GFX1132-NEXT: .LBB2_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB2_69 +; GFX1132-NEXT: .LBB2_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB2_72 +; GFX1132-NEXT: .LBB2_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, 
s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB2_75 +; GFX1132-NEXT: .LBB2_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB2_78 +; GFX1132-NEXT: .LBB2_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB2_81 +; GFX1132-NEXT: .LBB2_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX1132-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB2_84 +; GFX1132-NEXT: .LBB2_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB2_87 +; GFX1132-NEXT: .LBB2_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB2_90 +; GFX1132-NEXT: .LBB2_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_add_i32 s4, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1132-NEXT: s_branch .LBB2_93 +; GFX1132-NEXT: .LBB2_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_93: +; GFX1132-NEXT: s_and_b32 s3, s3, 
exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_add_i32 s4, s4, s2 +; GFX1132-NEXT: s_and_b32 s2, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1132-NEXT: s_branch .LBB2_96 +; GFX1132-NEXT: .LBB2_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_96: ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_readfirstlane_b32 s7, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s7, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB2_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s6, s6, exec_lo +; 
GFX1132-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_add_i32 s4, s4, s5 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB2_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm @@ -2084,368 +6783,5067 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB8_3 +; GFX8-NEXT: .LBB8_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; 
GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB8_6 +; GFX8-NEXT: .LBB8_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB8_9 +; GFX8-NEXT: .LBB8_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB8_12 +; GFX8-NEXT: .LBB8_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB8_15 +; GFX8-NEXT: .LBB8_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: 
.LBB8_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB8_18 +; GFX8-NEXT: .LBB8_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB8_21 +; GFX8-NEXT: .LBB8_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB8_24 +; GFX8-NEXT: .LBB8_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_26 +; GFX8-NEXT: ; %bb.25: +; 
GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB8_27 +; GFX8-NEXT: .LBB8_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB8_30 +; GFX8-NEXT: .LBB8_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB8_33 +; GFX8-NEXT: .LBB8_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB8_36 +; GFX8-NEXT: .LBB8_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB8_39 +; GFX8-NEXT: .LBB8_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB8_42 +; GFX8-NEXT: .LBB8_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB8_45 +; GFX8-NEXT: .LBB8_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB8_48 +; GFX8-NEXT: .LBB8_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB8_51 +; GFX8-NEXT: .LBB8_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB8_54 +; GFX8-NEXT: .LBB8_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB8_57 +; GFX8-NEXT: .LBB8_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 
19 +; GFX8-NEXT: s_branch .LBB8_60 +; GFX8-NEXT: .LBB8_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB8_63 +; GFX8-NEXT: .LBB8_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB8_66 +; GFX8-NEXT: .LBB8_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB8_69 +; GFX8-NEXT: .LBB8_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB8_72 +; GFX8-NEXT: .LBB8_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB8_75 +; GFX8-NEXT: .LBB8_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB8_78 +; GFX8-NEXT: .LBB8_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB8_81 +; GFX8-NEXT: .LBB8_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 
s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB8_84 +; GFX8-NEXT: .LBB8_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB8_87 +; GFX8-NEXT: .LBB8_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB8_90 +; GFX8-NEXT: .LBB8_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: 
v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB8_93 +; GFX8-NEXT: .LBB8_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB8_96 +; GFX8-NEXT: .LBB8_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB8_99 +; GFX8-NEXT: .LBB8_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB8_102 +; GFX8-NEXT: .LBB8_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB8_105 +; GFX8-NEXT: .LBB8_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB8_108 +; GFX8-NEXT: .LBB8_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB8_111 +; GFX8-NEXT: .LBB8_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB8_114 +; GFX8-NEXT: .LBB8_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB8_117 +; GFX8-NEXT: .LBB8_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB8_120 +; GFX8-NEXT: .LBB8_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB8_123 +; GFX8-NEXT: .LBB8_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB8_126 
+; GFX8-NEXT: .LBB8_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB8_129 +; GFX8-NEXT: .LBB8_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB8_132 +; GFX8-NEXT: .LBB8_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB8_135 +; GFX8-NEXT: .LBB8_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB8_138 +; GFX8-NEXT: .LBB8_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB8_141 +; GFX8-NEXT: .LBB8_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB8_144 +; GFX8-NEXT: .LBB8_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB8_147 +; GFX8-NEXT: .LBB8_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB8_150 +; GFX8-NEXT: .LBB8_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB8_153 +; GFX8-NEXT: .LBB8_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB8_156 +; GFX8-NEXT: .LBB8_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: 
s_branch .LBB8_159 +; GFX8-NEXT: .LBB8_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB8_162 +; GFX8-NEXT: .LBB8_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB8_165 +; GFX8-NEXT: .LBB8_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB8_168 +; GFX8-NEXT: .LBB8_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB8_171 +; GFX8-NEXT: .LBB8_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB8_174 +; GFX8-NEXT: .LBB8_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB8_177 +; GFX8-NEXT: .LBB8_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB8_180 +; GFX8-NEXT: .LBB8_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB8_183 +; GFX8-NEXT: .LBB8_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB8_186 +; GFX8-NEXT: .LBB8_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s8, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s8, 62 +; GFX8-NEXT: s_branch .LBB8_189 +; GFX8-NEXT: .LBB8_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s8, s8, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s9, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB8_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s8, 63 +; GFX8-NEXT: s_branch .LBB8_192 +; GFX8-NEXT: .LBB8_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB8_192: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s6, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_readfirstlane_b32 s6, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB8_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_cbranch_execz .LBB8_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s9, 0 +; GFX8-NEXT: 
s_add_i32 s4, s8, s4 ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol -; GFX8-NEXT: .LBB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: .LBB8_194: +; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB8_3 +; GFX9-NEXT: .LBB8_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB8_6 +; GFX9-NEXT: .LBB8_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: 
s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB8_9 +; GFX9-NEXT: .LBB8_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB8_12 +; GFX9-NEXT: .LBB8_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB8_15 +; GFX9-NEXT: .LBB8_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB8_18 +; GFX9-NEXT: .LBB8_17: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB8_21 +; GFX9-NEXT: .LBB8_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB8_24 +; GFX9-NEXT: .LBB8_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB8_27 +; GFX9-NEXT: .LBB8_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 
.LBB8_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB8_30 +; GFX9-NEXT: .LBB8_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB8_33 +; GFX9-NEXT: .LBB8_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB8_36 +; GFX9-NEXT: .LBB8_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB8_39 +; GFX9-NEXT: .LBB8_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB8_42 +; GFX9-NEXT: .LBB8_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB8_45 +; GFX9-NEXT: .LBB8_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB8_48 +; GFX9-NEXT: .LBB8_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB8_51 +; GFX9-NEXT: .LBB8_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_51: +; 
GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB8_54 +; GFX9-NEXT: .LBB8_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB8_57 +; GFX9-NEXT: .LBB8_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB8_60 +; GFX9-NEXT: .LBB8_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_62 +; GFX9-NEXT: ; 
%bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB8_63 +; GFX9-NEXT: .LBB8_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB8_66 +; GFX9-NEXT: .LBB8_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB8_69 +; GFX9-NEXT: .LBB8_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB8_72 +; GFX9-NEXT: .LBB8_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB8_75 +; GFX9-NEXT: .LBB8_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB8_78 +; GFX9-NEXT: .LBB8_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB8_81 +; GFX9-NEXT: .LBB8_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB8_84 +; GFX9-NEXT: .LBB8_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_84: +; GFX9-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB8_87 +; GFX9-NEXT: .LBB8_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB8_90 +; GFX9-NEXT: .LBB8_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB8_93 +; GFX9-NEXT: .LBB8_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB8_96 +; GFX9-NEXT: .LBB8_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB8_99 +; GFX9-NEXT: .LBB8_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB8_102 +; GFX9-NEXT: .LBB8_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB8_105 +; GFX9-NEXT: .LBB8_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB8_108 +; GFX9-NEXT: .LBB8_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB8_111 +; GFX9-NEXT: .LBB8_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB8_114 +; GFX9-NEXT: .LBB8_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB8_117 +; GFX9-NEXT: .LBB8_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec 
+; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB8_120 +; GFX9-NEXT: .LBB8_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB8_123 +; GFX9-NEXT: .LBB8_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB8_126 +; GFX9-NEXT: .LBB8_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 
v1, s6, 42 +; GFX9-NEXT: s_branch .LBB8_129 +; GFX9-NEXT: .LBB8_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB8_132 +; GFX9-NEXT: .LBB8_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB8_135 +; GFX9-NEXT: .LBB8_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB8_138 +; GFX9-NEXT: .LBB8_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB8_141 +; GFX9-NEXT: .LBB8_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB8_144 +; GFX9-NEXT: .LBB8_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB8_147 +; GFX9-NEXT: .LBB8_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB8_150 +; GFX9-NEXT: .LBB8_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB8_153 +; GFX9-NEXT: .LBB8_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB8_156 +; GFX9-NEXT: .LBB8_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB8_159 +; GFX9-NEXT: .LBB8_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_161 +; GFX9-NEXT: ; %bb.160: +; 
GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB8_162 +; GFX9-NEXT: .LBB8_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB8_165 +; GFX9-NEXT: .LBB8_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB8_168 +; GFX9-NEXT: .LBB8_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB8_171 +; GFX9-NEXT: .LBB8_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 
exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB8_174 +; GFX9-NEXT: .LBB8_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB8_177 +; GFX9-NEXT: .LBB8_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB8_180 +; GFX9-NEXT: .LBB8_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB8_183 +; GFX9-NEXT: .LBB8_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB8_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB8_186 +; GFX9-NEXT: .LBB8_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s8, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s8, 62 +; GFX9-NEXT: s_branch .LBB8_189 +; GFX9-NEXT: .LBB8_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s8, s8, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s9, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB8_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s8, 63 +; GFX9-NEXT: s_branch .LBB8_192 +; GFX9-NEXT: .LBB8_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB8_192: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, 
exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s6, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_readfirstlane_b32 s6, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB8_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_cbranch_execz .LBB8_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s9, 0 +; GFX9-NEXT: s_add_i32 s4, s8, s4 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: .LBB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: .LBB8_194: +; GFX9-NEXT: s_or_b64 exec, 
exec, s[6:7] ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1064-NEXT: s_branch .LBB8_3 +; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_3: 
+; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB8_6 +; GFX1064-NEXT: .LBB8_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB8_9 +; GFX1064-NEXT: .LBB8_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB8_12 +; GFX1064-NEXT: .LBB8_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 
exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB8_15 +; GFX1064-NEXT: .LBB8_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB8_18 +; GFX1064-NEXT: .LBB8_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB8_21 +; GFX1064-NEXT: .LBB8_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: 
s_branch .LBB8_24 +; GFX1064-NEXT: .LBB8_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB8_27 +; GFX1064-NEXT: .LBB8_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB8_30 +; GFX1064-NEXT: .LBB8_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB8_33 +; GFX1064-NEXT: .LBB8_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB8_36 +; GFX1064-NEXT: .LBB8_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB8_39 +; GFX1064-NEXT: .LBB8_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB8_42 +; GFX1064-NEXT: .LBB8_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: 
s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB8_45 +; GFX1064-NEXT: .LBB8_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB8_48 +; GFX1064-NEXT: .LBB8_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB8_51 +; GFX1064-NEXT: .LBB8_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: 
v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB8_54 +; GFX1064-NEXT: .LBB8_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB8_57 +; GFX1064-NEXT: .LBB8_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB8_60 +; GFX1064-NEXT: .LBB8_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB8_63 +; GFX1064-NEXT: .LBB8_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_63: +; 
GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB8_66 +; GFX1064-NEXT: .LBB8_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB8_69 +; GFX1064-NEXT: .LBB8_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB8_72 +; GFX1064-NEXT: .LBB8_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, 
exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB8_75 +; GFX1064-NEXT: .LBB8_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB8_78 +; GFX1064-NEXT: .LBB8_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB8_81 +; GFX1064-NEXT: .LBB8_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB8_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB8_84 +; GFX1064-NEXT: .LBB8_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB8_87 +; GFX1064-NEXT: .LBB8_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB8_90 +; GFX1064-NEXT: .LBB8_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_add_i32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB8_93 +; GFX1064-NEXT: .LBB8_92: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: s_add_i32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1064-NEXT: s_branch .LBB8_96 +; GFX1064-NEXT: .LBB8_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB8_99 +; GFX1064-NEXT: .LBB8_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB8_102 +; GFX1064-NEXT: .LBB8_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 
s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB8_105 +; GFX1064-NEXT: .LBB8_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB8_108 +; GFX1064-NEXT: .LBB8_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB8_111 +; GFX1064-NEXT: .LBB8_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 
.LBB8_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB8_114 +; GFX1064-NEXT: .LBB8_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB8_117 +; GFX1064-NEXT: .LBB8_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB8_120 +; GFX1064-NEXT: .LBB8_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB8_123 +; GFX1064-NEXT: .LBB8_122: +; GFX1064-NEXT: ; 
implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB8_126 +; GFX1064-NEXT: .LBB8_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB8_129 +; GFX1064-NEXT: .LBB8_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB8_132 +; GFX1064-NEXT: .LBB8_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: 
s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB8_135 +; GFX1064-NEXT: .LBB8_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB8_138 +; GFX1064-NEXT: .LBB8_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB8_141 +; GFX1064-NEXT: .LBB8_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB8_144 +; GFX1064-NEXT: .LBB8_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB8_147 +; GFX1064-NEXT: .LBB8_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB8_150 +; GFX1064-NEXT: .LBB8_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; 
GFX1064-NEXT: s_branch .LBB8_153 +; GFX1064-NEXT: .LBB8_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB8_156 +; GFX1064-NEXT: .LBB8_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB8_159 +; GFX1064-NEXT: .LBB8_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB8_162 +; GFX1064-NEXT: .LBB8_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB8_165 +; GFX1064-NEXT: .LBB8_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB8_168 +; GFX1064-NEXT: .LBB8_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB8_171 +; GFX1064-NEXT: .LBB8_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: 
v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB8_174 +; GFX1064-NEXT: .LBB8_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB8_177 +; GFX1064-NEXT: .LBB8_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB8_180 +; GFX1064-NEXT: .LBB8_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB8_183 +; GFX1064-NEXT: .LBB8_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_add_i32 s8, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s8, 61 +; GFX1064-NEXT: s_branch .LBB8_186 +; GFX1064-NEXT: .LBB8_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_add_i32 s8, s8, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s6, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s8, 62 +; GFX1064-NEXT: s_branch .LBB8_189 +; GFX1064-NEXT: .LBB8_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_add_i32 s8, s8, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s9, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB8_191 +; GFX1064-NEXT: ; %bb.190: +; 
GFX1064-NEXT: v_writelane_b32 v1, s8, 63 +; GFX1064-NEXT: s_branch .LBB8_192 +; GFX1064-NEXT: .LBB8_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB8_192: ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: s_mov_b32 s4, s9 -; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_readfirstlane_b32 s6, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB8_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB8_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s9, 0 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_add_i32 s4, s8, s4 +; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: v_mov_b32_e32 v0, s4 -; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s4, s2 -; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; 
GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv -; GFX1064-NEXT: .LBB8_2: +; GFX1064-NEXT: .LBB8_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB8_3 +; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 
s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB8_6 +; GFX1032-NEXT: .LBB8_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB8_9 +; GFX1032-NEXT: .LBB8_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB8_12 +; GFX1032-NEXT: .LBB8_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch 
.LBB8_15 +; GFX1032-NEXT: .LBB8_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB8_18 +; GFX1032-NEXT: .LBB8_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB8_21 +; GFX1032-NEXT: .LBB8_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB8_24 +; GFX1032-NEXT: .LBB8_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: 
v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB8_27 +; GFX1032-NEXT: .LBB8_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB8_30 +; GFX1032-NEXT: .LBB8_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB8_33 +; GFX1032-NEXT: .LBB8_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch 
.LBB8_36 +; GFX1032-NEXT: .LBB8_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB8_39 +; GFX1032-NEXT: .LBB8_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB8_42 +; GFX1032-NEXT: .LBB8_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB8_45 +; GFX1032-NEXT: .LBB8_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; 
GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB8_48 +; GFX1032-NEXT: .LBB8_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB8_51 +; GFX1032-NEXT: .LBB8_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB8_54 +; GFX1032-NEXT: .LBB8_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; 
GFX1032-NEXT: s_branch .LBB8_57 +; GFX1032-NEXT: .LBB8_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB8_60 +; GFX1032-NEXT: .LBB8_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB8_63 +; GFX1032-NEXT: .LBB8_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB8_66 +; GFX1032-NEXT: .LBB8_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: 
s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB8_69 +; GFX1032-NEXT: .LBB8_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB8_72 +; GFX1032-NEXT: .LBB8_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB8_75 +; GFX1032-NEXT: .LBB8_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: 
v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB8_78 +; GFX1032-NEXT: .LBB8_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB8_81 +; GFX1032-NEXT: .LBB8_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB8_84 +; GFX1032-NEXT: .LBB8_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB8_87 +; GFX1032-NEXT: .LBB8_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, 
exec_lo, 0x20000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB8_90 +; GFX1032-NEXT: .LBB8_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_add_i32 s4, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1032-NEXT: s_branch .LBB8_93 +; GFX1032-NEXT: .LBB8_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_93: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_add_i32 s4, s4, s2 +; GFX1032-NEXT: s_and_b32 s2, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB8_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1032-NEXT: s_branch .LBB8_96 +; GFX1032-NEXT: .LBB8_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB8_96: ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp 
v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s4 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s4, s6 -; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_readfirstlane_b32 s7, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB8_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s7, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB8_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1032-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_add_i32 s4, s4, s5 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: v_mov_b32_e32 v0, s4 -; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s4, s2 -; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv -; GFX1032-NEXT: .LBB8_2: +; GFX1032-NEXT: .LBB8_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; 
GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_readlane_b32 s6, v1, 15 -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 
row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_readlane_b32 s8, v1, 47 -; GFX1164-NEXT: v_readlane_b32 s9, v1, 63 -; GFX1164-NEXT: v_writelane_b32 v3, s7, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB8_3 +; GFX1164-NEXT: .LBB8_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB8_6 +; GFX1164-NEXT: .LBB8_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB8_9 +; GFX1164-NEXT: .LBB8_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, 
s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB8_12 +; GFX1164-NEXT: .LBB8_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB8_15 +; GFX1164-NEXT: .LBB8_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB8_18 +; GFX1164-NEXT: .LBB8_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB8_21 +; GFX1164-NEXT: .LBB8_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB8_24 +; GFX1164-NEXT: .LBB8_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB8_27 +; GFX1164-NEXT: .LBB8_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB8_30 +; 
GFX1164-NEXT: .LBB8_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB8_33 +; GFX1164-NEXT: .LBB8_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB8_36 +; GFX1164-NEXT: .LBB8_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB8_39 +; GFX1164-NEXT: .LBB8_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 
0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB8_42 +; GFX1164-NEXT: .LBB8_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB8_45 +; GFX1164-NEXT: .LBB8_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB8_48 +; GFX1164-NEXT: .LBB8_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 
+; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB8_51 +; GFX1164-NEXT: .LBB8_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB8_54 +; GFX1164-NEXT: .LBB8_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB8_57 +; GFX1164-NEXT: .LBB8_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; 
GFX1164-NEXT: s_branch .LBB8_60 +; GFX1164-NEXT: .LBB8_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB8_63 +; GFX1164-NEXT: .LBB8_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB8_66 +; GFX1164-NEXT: .LBB8_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB8_69 +; GFX1164-NEXT: .LBB8_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB8_72 +; GFX1164-NEXT: .LBB8_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB8_75 +; GFX1164-NEXT: .LBB8_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB8_78 +; GFX1164-NEXT: .LBB8_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: 
s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB8_81 +; GFX1164-NEXT: .LBB8_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB8_84 +; GFX1164-NEXT: .LBB8_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB8_87 +; GFX1164-NEXT: .LBB8_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_89 +; 
GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB8_90 +; GFX1164-NEXT: .LBB8_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_add_i32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB8_93 +; GFX1164-NEXT: .LBB8_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1164-NEXT: s_mov_b32 s4, s9 -; GFX1164-NEXT: v_writelane_b32 v3, s8, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[6:7] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s6, -1 +; GFX1164-NEXT: s_add_i32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1164-NEXT: s_branch .LBB8_96 +; GFX1164-NEXT: .LBB8_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB8_99 +; GFX1164-NEXT: .LBB8_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB8_102 +; GFX1164-NEXT: .LBB8_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB8_105 +; GFX1164-NEXT: .LBB8_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: 
s_branch .LBB8_108 +; GFX1164-NEXT: .LBB8_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB8_111 +; GFX1164-NEXT: .LBB8_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB8_114 +; GFX1164-NEXT: .LBB8_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB8_117 +; GFX1164-NEXT: .LBB8_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB8_120 +; GFX1164-NEXT: .LBB8_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB8_123 +; GFX1164-NEXT: .LBB8_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB8_126 +; GFX1164-NEXT: .LBB8_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB8_129 +; GFX1164-NEXT: .LBB8_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB8_132 +; GFX1164-NEXT: .LBB8_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB8_135 +; GFX1164-NEXT: .LBB8_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_137 +; GFX1164-NEXT: ; %bb.136: 
+; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB8_138 +; GFX1164-NEXT: .LBB8_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB8_141 +; GFX1164-NEXT: .LBB8_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB8_144 +; GFX1164-NEXT: .LBB8_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB8_147 +; GFX1164-NEXT: .LBB8_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB8_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB8_150 +; GFX1164-NEXT: .LBB8_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB8_153 +; GFX1164-NEXT: .LBB8_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB8_156 +; GFX1164-NEXT: .LBB8_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_add_i32 s6, 
s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB8_159 +; GFX1164-NEXT: .LBB8_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB8_162 +; GFX1164-NEXT: .LBB8_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB8_165 +; GFX1164-NEXT: .LBB8_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB8_168 +; GFX1164-NEXT: .LBB8_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB8_171 +; GFX1164-NEXT: .LBB8_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB8_174 +; GFX1164-NEXT: .LBB8_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 
+; GFX1164-NEXT: s_branch .LBB8_177 +; GFX1164-NEXT: .LBB8_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB8_180 +; GFX1164-NEXT: .LBB8_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB8_183 +; GFX1164-NEXT: .LBB8_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_add_i32 s8, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s6, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s8, 61 +; GFX1164-NEXT: s_branch .LBB8_186 +; GFX1164-NEXT: .LBB8_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_add_i32 s8, s8, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s6, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s8, 62 +; GFX1164-NEXT: s_branch .LBB8_189 +; GFX1164-NEXT: .LBB8_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s6, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_add_i32 s8, s8, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s9, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB8_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s8, 63 +; GFX1164-NEXT: s_branch .LBB8_192 +; GFX1164-NEXT: .LBB8_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB8_192: +; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1164-NEXT: v_readfirstlane_b32 s6, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB8_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB8_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s9, 0 +; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-NEXT: s_add_i32 s4, s8, s4 +; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: v_mov_b32_e32 v0, s4 -; GFX1164-NEXT: s_mov_b32 
s7, 0x31016000 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-NEXT: s_mov_b32 s4, s2 -; GFX1164-NEXT: s_mov_b32 s5, s3 +; GFX1164-NEXT: s_mov_b32 s8, s2 +; GFX1164-NEXT: s_mov_b32 s9, s3 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: buffer_gl1_inv -; GFX1164-NEXT: .LBB8_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1164-NEXT: .LBB8_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1164-NEXT: s_mov_b32 s2, s6 +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB8_3 +; GFX1132-NEXT: .LBB8_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB8_6 +; GFX1132-NEXT: .LBB8_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB8_9 +; GFX1132-NEXT: .LBB8_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, 
-1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB8_12 +; GFX1132-NEXT: .LBB8_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB8_15 +; GFX1132-NEXT: .LBB8_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB8_18 +; GFX1132-NEXT: .LBB8_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB8_21 +; GFX1132-NEXT: .LBB8_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_21: +; GFX1132-NEXT: 
s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB8_24 +; GFX1132-NEXT: .LBB8_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB8_27 +; GFX1132-NEXT: .LBB8_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB8_30 +; GFX1132-NEXT: .LBB8_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB8_33 +; GFX1132-NEXT: .LBB8_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB8_36 +; GFX1132-NEXT: .LBB8_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB8_39 +; GFX1132-NEXT: .LBB8_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB8_42 +; GFX1132-NEXT: .LBB8_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_42: +; 
GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB8_45 +; GFX1132-NEXT: .LBB8_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB8_48 +; GFX1132-NEXT: .LBB8_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB8_51 +; GFX1132-NEXT: .LBB8_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB8_54 +; GFX1132-NEXT: .LBB8_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB8_57 +; GFX1132-NEXT: .LBB8_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB8_60 +; GFX1132-NEXT: .LBB8_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB8_63 +; GFX1132-NEXT: .LBB8_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB8_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB8_66 +; GFX1132-NEXT: .LBB8_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB8_69 +; GFX1132-NEXT: .LBB8_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB8_72 +; GFX1132-NEXT: .LBB8_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 
exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB8_75 +; GFX1132-NEXT: .LBB8_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB8_78 +; GFX1132-NEXT: .LBB8_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB8_81 +; GFX1132-NEXT: .LBB8_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB8_84 +; GFX1132-NEXT: .LBB8_83: +; 
GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB8_87 +; GFX1132-NEXT: .LBB8_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB8_90 +; GFX1132-NEXT: .LBB8_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_add_i32 s4, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1132-NEXT: s_branch .LBB8_93 +; GFX1132-NEXT: .LBB8_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_93: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_add_i32 s4, s4, s2 +; GFX1132-NEXT: s_and_b32 s2, 
exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s2, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB8_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1132-NEXT: s_branch .LBB8_96 +; GFX1132-NEXT: .LBB8_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB8_96: ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_readlane_b32 s5, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s6, v1, 31 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s4, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s5, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s4 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s4, s6 -; GFX1132-NEXT: s_mov_b32 s6, -1 +; GFX1132-NEXT: v_readfirstlane_b32 s7, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB8_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s7, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s7, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB8_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1132-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-NEXT: s_add_i32 s4, s4, s5 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: 
v_mov_b32_e32 v0, s4 -; GFX1132-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-NEXT: s_mov_b32 s4, s2 -; GFX1132-NEXT: s_mov_b32 s5, s3 +; GFX1132-NEXT: s_mov_b32 s8, s2 +; GFX1132-NEXT: s_mov_b32 s9, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc +; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv -; GFX1132-NEXT: .LBB8_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1132-NEXT: .LBB8_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s7 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX1132-NEXT: s_mov_b32 s2, s6 +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc 
-march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s @@ -491,269 +491,4528 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB2_3 +; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: 
s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB2_6 +; GFX8-NEXT: .LBB2_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB2_9 +; GFX8-NEXT: .LBB2_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB2_12 +; GFX8-NEXT: .LBB2_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB2_15 +; GFX8-NEXT: .LBB2_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_15: +; 
GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB2_18 +; GFX8-NEXT: .LBB2_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB2_21 +; GFX8-NEXT: .LBB2_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB2_24 +; GFX8-NEXT: .LBB2_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB2_27 +; GFX8-NEXT: .LBB2_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB2_30 +; GFX8-NEXT: .LBB2_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB2_33 +; GFX8-NEXT: .LBB2_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB2_36 +; GFX8-NEXT: .LBB2_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB2_39 +; GFX8-NEXT: .LBB2_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB2_42 +; GFX8-NEXT: .LBB2_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB2_45 +; GFX8-NEXT: .LBB2_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB2_48 +; GFX8-NEXT: .LBB2_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 
s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB2_51 +; GFX8-NEXT: .LBB2_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB2_54 +; GFX8-NEXT: .LBB2_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB2_57 +; GFX8-NEXT: .LBB2_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: 
s_branch .LBB2_60 +; GFX8-NEXT: .LBB2_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB2_63 +; GFX8-NEXT: .LBB2_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB2_66 +; GFX8-NEXT: .LBB2_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB2_69 +; GFX8-NEXT: .LBB2_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB2_72 +; GFX8-NEXT: .LBB2_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB2_75 +; GFX8-NEXT: .LBB2_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB2_78 +; GFX8-NEXT: .LBB2_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB2_81 +; GFX8-NEXT: .LBB2_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 
+; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB2_84 +; GFX8-NEXT: .LBB2_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB2_87 +; GFX8-NEXT: .LBB2_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB2_90 +; GFX8-NEXT: .LBB2_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 
30 +; GFX8-NEXT: s_branch .LBB2_93 +; GFX8-NEXT: .LBB2_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB2_96 +; GFX8-NEXT: .LBB2_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB2_99 +; GFX8-NEXT: .LBB2_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB2_102 +; GFX8-NEXT: .LBB2_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 
+; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB2_105 +; GFX8-NEXT: .LBB2_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB2_108 +; GFX8-NEXT: .LBB2_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB2_111 +; GFX8-NEXT: .LBB2_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB2_114 +; GFX8-NEXT: .LBB2_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, 
s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB2_117 +; GFX8-NEXT: .LBB2_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB2_120 +; GFX8-NEXT: .LBB2_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB2_123 +; GFX8-NEXT: .LBB2_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB2_126 +; GFX8-NEXT: .LBB2_125: 
+; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB2_129 +; GFX8-NEXT: .LBB2_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB2_132 +; GFX8-NEXT: .LBB2_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB2_135 +; GFX8-NEXT: .LBB2_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB2_138 +; GFX8-NEXT: .LBB2_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB2_141 +; GFX8-NEXT: .LBB2_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB2_144 +; GFX8-NEXT: .LBB2_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB2_147 +; GFX8-NEXT: .LBB2_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 
s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB2_150 +; GFX8-NEXT: .LBB2_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB2_153 +; GFX8-NEXT: .LBB2_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB2_156 +; GFX8-NEXT: .LBB2_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB2_159 +; 
GFX8-NEXT: .LBB2_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB2_162 +; GFX8-NEXT: .LBB2_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB2_165 +; GFX8-NEXT: .LBB2_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB2_168 +; GFX8-NEXT: .LBB2_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB2_171 +; GFX8-NEXT: .LBB2_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB2_174 +; GFX8-NEXT: .LBB2_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB2_177 +; GFX8-NEXT: .LBB2_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB2_180 +; GFX8-NEXT: .LBB2_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB2_183 +; GFX8-NEXT: .LBB2_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB2_186 +; GFX8-NEXT: .LBB2_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB2_189 +; GFX8-NEXT: .LBB2_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, 
v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB2_192 +; GFX8-NEXT: .LBB2_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_cbranch_execz .LBB2_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: .LBB2_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, 
exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB2_3 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB2_6 +; GFX9-NEXT: .LBB2_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: 
s_branch .LBB2_9 +; GFX9-NEXT: .LBB2_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB2_12 +; GFX9-NEXT: .LBB2_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB2_15 +; GFX9-NEXT: .LBB2_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB2_18 +; GFX9-NEXT: .LBB2_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB2_21 +; GFX9-NEXT: .LBB2_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB2_24 +; GFX9-NEXT: .LBB2_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB2_27 +; GFX9-NEXT: .LBB2_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB2_30 +; GFX9-NEXT: .LBB2_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, 
exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB2_33 +; GFX9-NEXT: .LBB2_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB2_36 +; GFX9-NEXT: .LBB2_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB2_39 +; GFX9-NEXT: .LBB2_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB2_42 +; GFX9-NEXT: .LBB2_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; 
GFX9-NEXT: .LBB2_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB2_45 +; GFX9-NEXT: .LBB2_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB2_48 +; GFX9-NEXT: .LBB2_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB2_51 +; GFX9-NEXT: .LBB2_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 
.LBB2_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB2_54 +; GFX9-NEXT: .LBB2_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB2_57 +; GFX9-NEXT: .LBB2_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB2_60 +; GFX9-NEXT: .LBB2_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB2_63 +; GFX9-NEXT: .LBB2_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB2_66 +; GFX9-NEXT: .LBB2_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB2_69 +; GFX9-NEXT: .LBB2_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB2_72 +; GFX9-NEXT: .LBB2_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB2_75 +; GFX9-NEXT: .LBB2_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_75: +; 
GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB2_78 +; GFX9-NEXT: .LBB2_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB2_81 +; GFX9-NEXT: .LBB2_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB2_84 +; GFX9-NEXT: .LBB2_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_86 +; 
GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB2_87 +; GFX9-NEXT: .LBB2_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB2_90 +; GFX9-NEXT: .LBB2_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB2_93 +; GFX9-NEXT: .LBB2_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB2_96 +; GFX9-NEXT: .LBB2_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB2_99 +; GFX9-NEXT: .LBB2_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB2_102 +; GFX9-NEXT: .LBB2_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB2_105 +; GFX9-NEXT: .LBB2_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB2_108 +; GFX9-NEXT: .LBB2_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_108: +; GFX9-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB2_111 +; GFX9-NEXT: .LBB2_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB2_114 +; GFX9-NEXT: .LBB2_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB2_117 +; GFX9-NEXT: .LBB2_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX9-NEXT: ; %bb.118: +; 
GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB2_120 +; GFX9-NEXT: .LBB2_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB2_123 +; GFX9-NEXT: .LBB2_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB2_126 +; GFX9-NEXT: .LBB2_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB2_129 +; GFX9-NEXT: .LBB2_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB2_132 +; GFX9-NEXT: .LBB2_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB2_135 +; GFX9-NEXT: .LBB2_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB2_138 +; GFX9-NEXT: .LBB2_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB2_141 +; GFX9-NEXT: .LBB2_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_141: +; GFX9-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB2_144 +; GFX9-NEXT: .LBB2_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB2_147 +; GFX9-NEXT: .LBB2_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB2_150 +; GFX9-NEXT: .LBB2_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_152 +; 
GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB2_153 +; GFX9-NEXT: .LBB2_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB2_156 +; GFX9-NEXT: .LBB2_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB2_159 +; GFX9-NEXT: .LBB2_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB2_162 +; GFX9-NEXT: .LBB2_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; 
GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB2_165 +; GFX9-NEXT: .LBB2_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB2_168 +; GFX9-NEXT: .LBB2_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB2_171 +; GFX9-NEXT: .LBB2_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB2_174 +; GFX9-NEXT: .LBB2_173: +; GFX9-NEXT: ; implicit-def: 
$vgpr1 +; GFX9-NEXT: .LBB2_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB2_177 +; GFX9-NEXT: .LBB2_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB2_180 +; GFX9-NEXT: .LBB2_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB2_183 +; GFX9-NEXT: .LBB2_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, 
v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB2_186 +; GFX9-NEXT: .LBB2_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB2_189 +; GFX9-NEXT: .LBB2_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB2_192 +; GFX9-NEXT: .LBB2_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_cbranch_execz .LBB2_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 
v0, v3, v0 +; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 
15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1064-NEXT: s_branch .LBB2_3 +; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB2_6 +; GFX1064-NEXT: .LBB2_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB2_9 +; GFX1064-NEXT: .LBB2_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: 
.LBB2_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB2_12 +; GFX1064-NEXT: .LBB2_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB2_15 +; GFX1064-NEXT: .LBB2_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB2_18 +; GFX1064-NEXT: .LBB2_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; 
GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB2_21 +; GFX1064-NEXT: .LBB2_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB2_24 +; GFX1064-NEXT: .LBB2_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB2_27 +; GFX1064-NEXT: .LBB2_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1064-NEXT: ; 
%bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB2_30 +; GFX1064-NEXT: .LBB2_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB2_33 +; GFX1064-NEXT: .LBB2_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB2_36 +; GFX1064-NEXT: .LBB2_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB2_39 +; GFX1064-NEXT: .LBB2_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: 
.LBB2_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB2_42 +; GFX1064-NEXT: .LBB2_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB2_45 +; GFX1064-NEXT: .LBB2_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB2_48 +; GFX1064-NEXT: .LBB2_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 
s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB2_51 +; GFX1064-NEXT: .LBB2_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB2_54 +; GFX1064-NEXT: .LBB2_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB2_57 +; GFX1064-NEXT: .LBB2_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB2_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB2_60 +; GFX1064-NEXT: .LBB2_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB2_63 +; GFX1064-NEXT: .LBB2_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB2_66 +; GFX1064-NEXT: .LBB2_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB2_69 +; GFX1064-NEXT: .LBB2_68: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB2_72 +; GFX1064-NEXT: .LBB2_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB2_75 +; GFX1064-NEXT: .LBB2_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB2_78 +; GFX1064-NEXT: .LBB2_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; 
GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB2_81 +; GFX1064-NEXT: .LBB2_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB2_84 +; GFX1064-NEXT: .LBB2_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB2_87 +; GFX1064-NEXT: .LBB2_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB2_90 +; GFX1064-NEXT: .LBB2_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_add_i32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB2_93 +; GFX1064-NEXT: .LBB2_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: s_add_i32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1064-NEXT: s_branch .LBB2_96 +; GFX1064-NEXT: .LBB2_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB2_99 +; GFX1064-NEXT: .LBB2_98: 
+; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB2_102 +; GFX1064-NEXT: .LBB2_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB2_105 +; GFX1064-NEXT: .LBB2_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB2_108 +; GFX1064-NEXT: .LBB2_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; 
GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB2_111 +; GFX1064-NEXT: .LBB2_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB2_114 +; GFX1064-NEXT: .LBB2_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB2_117 +; GFX1064-NEXT: .LBB2_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB2_120 +; GFX1064-NEXT: .LBB2_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB2_123 +; GFX1064-NEXT: .LBB2_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB2_126 +; GFX1064-NEXT: .LBB2_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; 
GFX1064-NEXT: s_branch .LBB2_129 +; GFX1064-NEXT: .LBB2_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB2_132 +; GFX1064-NEXT: .LBB2_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB2_135 +; GFX1064-NEXT: .LBB2_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB2_138 +; GFX1064-NEXT: .LBB2_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: 
s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB2_141 +; GFX1064-NEXT: .LBB2_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB2_144 +; GFX1064-NEXT: .LBB2_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB2_147 +; GFX1064-NEXT: .LBB2_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB2_150 +; GFX1064-NEXT: .LBB2_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB2_153 +; GFX1064-NEXT: .LBB2_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB2_156 +; GFX1064-NEXT: .LBB2_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 
.LBB2_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB2_159 +; GFX1064-NEXT: .LBB2_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB2_162 +; GFX1064-NEXT: .LBB2_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB2_165 +; GFX1064-NEXT: .LBB2_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB2_168 +; GFX1064-NEXT: .LBB2_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 
+; GFX1064-NEXT: .LBB2_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB2_171 +; GFX1064-NEXT: .LBB2_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB2_174 +; GFX1064-NEXT: .LBB2_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB2_177 +; GFX1064-NEXT: .LBB2_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, 
exec_hi, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB2_180 +; GFX1064-NEXT: .LBB2_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB2_183 +; GFX1064-NEXT: .LBB2_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB2_186 +; GFX1064-NEXT: .LBB2_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB2_189 +; GFX1064-NEXT: .LBB2_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB2_192 +; GFX1064-NEXT: .LBB2_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB2_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB2_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_add_i32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB2_2: +; GFX1064-NEXT: .LBB2_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; 
GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: 
s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB2_3 +; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB2_6 +; GFX1032-NEXT: .LBB2_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB2_9 +; GFX1032-NEXT: .LBB2_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB2_12 +; GFX1032-NEXT: .LBB2_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: 
.LBB2_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB2_15 +; GFX1032-NEXT: .LBB2_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB2_18 +; GFX1032-NEXT: .LBB2_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB2_21 +; GFX1032-NEXT: .LBB2_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 
+; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB2_24 +; GFX1032-NEXT: .LBB2_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB2_27 +; GFX1032-NEXT: .LBB2_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB2_30 +; GFX1032-NEXT: .LBB2_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB2_33 +; GFX1032-NEXT: .LBB2_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_33: +; 
GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB2_36 +; GFX1032-NEXT: .LBB2_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB2_39 +; GFX1032-NEXT: .LBB2_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB2_42 +; GFX1032-NEXT: .LBB2_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB2_45 +; GFX1032-NEXT: .LBB2_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB2_48 +; GFX1032-NEXT: .LBB2_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB2_51 +; GFX1032-NEXT: .LBB2_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB2_54 +; GFX1032-NEXT: .LBB2_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; 
GFX1032-NEXT: .LBB2_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB2_57 +; GFX1032-NEXT: .LBB2_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB2_60 +; GFX1032-NEXT: .LBB2_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB2_63 +; GFX1032-NEXT: .LBB2_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 
exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB2_66 +; GFX1032-NEXT: .LBB2_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB2_69 +; GFX1032-NEXT: .LBB2_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB2_72 +; GFX1032-NEXT: .LBB2_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB2_75 +; GFX1032-NEXT: .LBB2_74: +; 
GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB2_78 +; GFX1032-NEXT: .LBB2_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB2_81 +; GFX1032-NEXT: .LBB2_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB2_84 +; GFX1032-NEXT: .LBB2_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 
s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB2_87 +; GFX1032-NEXT: .LBB2_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB2_90 +; GFX1032-NEXT: .LBB2_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB2_93 +; GFX1032-NEXT: .LBB2_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: 
v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB2_96 +; GFX1032-NEXT: .LBB2_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB2_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB2_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB2_2: +; GFX1032-NEXT: .LBB2_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_2 +; 
GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 
-; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB2_3 +; GFX1164-NEXT: .LBB2_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB2_6 +; GFX1164-NEXT: .LBB2_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB2_9 +; GFX1164-NEXT: .LBB2_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB2_12 +; GFX1164-NEXT: .LBB2_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_12: +; GFX1164-NEXT: s_and_b64 
s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB2_15 +; GFX1164-NEXT: .LBB2_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB2_18 +; GFX1164-NEXT: .LBB2_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB2_21 +; GFX1164-NEXT: .LBB2_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB2_24 +; GFX1164-NEXT: .LBB2_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB2_27 +; GFX1164-NEXT: .LBB2_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB2_30 +; GFX1164-NEXT: .LBB2_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1164-NEXT: ; 
%bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB2_33 +; GFX1164-NEXT: .LBB2_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB2_36 +; GFX1164-NEXT: .LBB2_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB2_39 +; GFX1164-NEXT: .LBB2_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB2_42 +; GFX1164-NEXT: .LBB2_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_42: +; GFX1164-NEXT: s_and_b64 
s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB2_45 +; GFX1164-NEXT: .LBB2_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB2_48 +; GFX1164-NEXT: .LBB2_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB2_51 +; GFX1164-NEXT: .LBB2_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: 
v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB2_54 +; GFX1164-NEXT: .LBB2_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB2_57 +; GFX1164-NEXT: .LBB2_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB2_60 +; GFX1164-NEXT: .LBB2_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB2_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB2_63 +; GFX1164-NEXT: .LBB2_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB2_66 +; GFX1164-NEXT: .LBB2_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB2_69 +; GFX1164-NEXT: .LBB2_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB2_72 +; GFX1164-NEXT: .LBB2_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB2_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB2_75 +; GFX1164-NEXT: .LBB2_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB2_78 +; GFX1164-NEXT: .LBB2_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB2_81 +; GFX1164-NEXT: .LBB2_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; 
GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB2_84 +; GFX1164-NEXT: .LBB2_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB2_87 +; GFX1164-NEXT: .LBB2_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB2_90 +; GFX1164-NEXT: .LBB2_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_add_i32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; 
GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB2_93 +; GFX1164-NEXT: .LBB2_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_add_i32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1164-NEXT: s_branch .LBB2_96 +; GFX1164-NEXT: .LBB2_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB2_99 +; GFX1164-NEXT: .LBB2_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB2_102 +; GFX1164-NEXT: .LBB2_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB2_105 +; GFX1164-NEXT: .LBB2_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB2_108 +; GFX1164-NEXT: .LBB2_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_110 +; 
GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB2_111 +; GFX1164-NEXT: .LBB2_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB2_114 +; GFX1164-NEXT: .LBB2_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB2_117 +; GFX1164-NEXT: .LBB2_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB2_120 +; GFX1164-NEXT: .LBB2_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_120: +; 
GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB2_123 +; GFX1164-NEXT: .LBB2_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB2_126 +; GFX1164-NEXT: .LBB2_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB2_129 +; GFX1164-NEXT: .LBB2_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_add_i32 s6, 
s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB2_132 +; GFX1164-NEXT: .LBB2_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB2_135 +; GFX1164-NEXT: .LBB2_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB2_138 +; GFX1164-NEXT: .LBB2_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB2_141 +; GFX1164-NEXT: .LBB2_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB2_144 +; GFX1164-NEXT: .LBB2_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB2_147 +; GFX1164-NEXT: .LBB2_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB2_150 +; GFX1164-NEXT: 
.LBB2_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB2_153 +; GFX1164-NEXT: .LBB2_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB2_156 +; GFX1164-NEXT: .LBB2_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB2_159 +; GFX1164-NEXT: .LBB2_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB2_162 +; GFX1164-NEXT: .LBB2_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB2_165 +; GFX1164-NEXT: .LBB2_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB2_168 +; GFX1164-NEXT: .LBB2_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 
+; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB2_171 +; GFX1164-NEXT: .LBB2_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB2_174 +; GFX1164-NEXT: .LBB2_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB2_177 +; GFX1164-NEXT: .LBB2_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_179 +; 
GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB2_180 +; GFX1164-NEXT: .LBB2_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB2_183 +; GFX1164-NEXT: .LBB2_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB2_186 +; GFX1164-NEXT: .LBB2_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB2_189 
+; GFX1164-NEXT: .LBB2_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB2_192 +; GFX1164-NEXT: .LBB2_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB2_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB2_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB2_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_add_i32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB2_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB2_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -761,53 +5020,510 @@ ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 
15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB2_3 +; GFX1132-NEXT: .LBB2_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB2_6 +; GFX1132-NEXT: .LBB2_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB2_9 +; GFX1132-NEXT: .LBB2_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 
exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB2_12 +; GFX1132-NEXT: .LBB2_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB2_15 +; GFX1132-NEXT: .LBB2_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB2_18 +; GFX1132-NEXT: .LBB2_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB2_21 +; GFX1132-NEXT: .LBB2_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 
+; GFX1132-NEXT: .LBB2_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB2_24 +; GFX1132-NEXT: .LBB2_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB2_27 +; GFX1132-NEXT: .LBB2_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB2_30 +; GFX1132-NEXT: .LBB2_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; 
GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB2_33 +; GFX1132-NEXT: .LBB2_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB2_36 +; GFX1132-NEXT: .LBB2_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB2_39 +; GFX1132-NEXT: .LBB2_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB2_42 +; GFX1132-NEXT: .LBB2_41: +; GFX1132-NEXT: ; implicit-def: 
$vgpr1 +; GFX1132-NEXT: .LBB2_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB2_45 +; GFX1132-NEXT: .LBB2_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB2_48 +; GFX1132-NEXT: .LBB2_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB2_51 +; GFX1132-NEXT: .LBB2_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 
exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB2_54 +; GFX1132-NEXT: .LBB2_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB2_57 +; GFX1132-NEXT: .LBB2_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB2_60 +; GFX1132-NEXT: .LBB2_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB2_63 +; GFX1132-NEXT: .LBB2_62: +; 
GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB2_66 +; GFX1132-NEXT: .LBB2_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB2_69 +; GFX1132-NEXT: .LBB2_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB2_72 +; GFX1132-NEXT: .LBB2_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, 
v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB2_75 +; GFX1132-NEXT: .LBB2_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB2_78 +; GFX1132-NEXT: .LBB2_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB2_81 +; GFX1132-NEXT: .LBB2_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch 
.LBB2_84 +; GFX1132-NEXT: .LBB2_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB2_87 +; GFX1132-NEXT: .LBB2_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB2_90 +; GFX1132-NEXT: .LBB2_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB2_93 +; GFX1132-NEXT: .LBB2_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 
+; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB2_96 +; GFX1132-NEXT: .LBB2_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB2_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB2_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB2_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_add_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB2_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB2_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -831,106 +5547,821 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s67, v1 +; GFX8-NEXT: v_readlane_b32 s66, v0, 0 +; GFX8-NEXT: v_readlane_b32 s65, v0, 1 +; GFX8-NEXT: v_readlane_b32 s64, v0, 2 +; GFX8-NEXT: v_readlane_b32 s63, v0, 3 +; GFX8-NEXT: v_readlane_b32 s62, v0, 4 +; GFX8-NEXT: v_readlane_b32 s61, v0, 5 +; GFX8-NEXT: v_readlane_b32 s60, v0, 6 +; GFX8-NEXT: v_readlane_b32 s59, v0, 7 +; GFX8-NEXT: v_readlane_b32 s58, v0, 8 +; GFX8-NEXT: v_readlane_b32 s57, v0, 9 +; GFX8-NEXT: v_readlane_b32 s56, v0, 10 +; GFX8-NEXT: 
v_readlane_b32 s55, v0, 11 +; GFX8-NEXT: v_readlane_b32 s54, v0, 12 +; GFX8-NEXT: v_readlane_b32 s53, v0, 13 +; GFX8-NEXT: v_readlane_b32 s52, v0, 14 +; GFX8-NEXT: v_readlane_b32 s51, v0, 15 +; GFX8-NEXT: v_readlane_b32 s50, v0, 16 +; GFX8-NEXT: v_readlane_b32 s49, v0, 17 +; GFX8-NEXT: v_readlane_b32 s48, v0, 18 +; GFX8-NEXT: v_readlane_b32 s47, v0, 19 +; GFX8-NEXT: v_readlane_b32 s46, v0, 20 +; GFX8-NEXT: v_readlane_b32 s45, v0, 21 +; GFX8-NEXT: v_readlane_b32 s44, v0, 22 +; GFX8-NEXT: v_readlane_b32 s43, v0, 23 +; GFX8-NEXT: v_readlane_b32 s42, v0, 24 +; GFX8-NEXT: v_readlane_b32 s41, v0, 25 +; GFX8-NEXT: v_readlane_b32 s40, v0, 26 +; GFX8-NEXT: v_readlane_b32 s39, v0, 27 +; GFX8-NEXT: v_readlane_b32 s38, v0, 28 +; GFX8-NEXT: v_readlane_b32 s37, v0, 29 +; GFX8-NEXT: v_readlane_b32 s36, v0, 30 +; GFX8-NEXT: v_readlane_b32 s35, v0, 31 +; GFX8-NEXT: v_readlane_b32 s34, v0, 32 +; GFX8-NEXT: v_readlane_b32 s33, v0, 33 +; GFX8-NEXT: v_readlane_b32 s31, v0, 34 +; GFX8-NEXT: v_readlane_b32 s30, v0, 35 +; GFX8-NEXT: v_readlane_b32 s29, v0, 36 +; GFX8-NEXT: v_readlane_b32 s28, v0, 37 +; GFX8-NEXT: v_readlane_b32 s27, v0, 38 +; GFX8-NEXT: v_readlane_b32 s26, v0, 39 +; GFX8-NEXT: v_readlane_b32 s25, v0, 40 +; GFX8-NEXT: v_readlane_b32 s24, v0, 41 +; GFX8-NEXT: v_readlane_b32 s23, v0, 42 +; GFX8-NEXT: v_readlane_b32 s22, v0, 43 +; GFX8-NEXT: v_readlane_b32 s21, v0, 44 +; GFX8-NEXT: v_readlane_b32 s20, v0, 45 +; GFX8-NEXT: v_readlane_b32 s19, v0, 46 +; GFX8-NEXT: v_readlane_b32 s18, v0, 47 +; GFX8-NEXT: v_readlane_b32 s17, v0, 48 +; GFX8-NEXT: v_readlane_b32 s16, v0, 49 +; GFX8-NEXT: v_readlane_b32 s15, v0, 50 +; GFX8-NEXT: v_readlane_b32 s14, v0, 51 +; GFX8-NEXT: v_readlane_b32 s13, v0, 52 +; GFX8-NEXT: v_readlane_b32 s12, v0, 53 +; GFX8-NEXT: v_readlane_b32 s11, v0, 54 +; GFX8-NEXT: v_readlane_b32 s10, v0, 55 +; GFX8-NEXT: v_readlane_b32 s9, v0, 56 +; GFX8-NEXT: v_readlane_b32 s8, v0, 57 +; GFX8-NEXT: v_readlane_b32 s7, v0, 58 +; GFX8-NEXT: v_readlane_b32 s6, v0, 59 +; 
GFX8-NEXT: v_readlane_b32 s5, v0, 60 +; GFX8-NEXT: v_readlane_b32 s4, v0, 61 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: v_readlane_b32 s2, v0, 63 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s67, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[68:69], vcc ; GFX8-NEXT: s_cbranch_execz .LBB3_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_bitcmp1_b32 s0, 0 +; GFX8-NEXT: s_cselect_b32 s66, s66, 0 +; GFX8-NEXT: s_bitcmp1_b32 s0, 1 +; GFX8-NEXT: s_cselect_b32 s65, s65, 0 +; GFX8-NEXT: s_add_i32 s65, s66, s65 +; GFX8-NEXT: s_bitcmp1_b32 s0, 2 +; GFX8-NEXT: s_cselect_b32 s64, s64, 0 +; GFX8-NEXT: s_add_i32 s64, s65, s64 +; GFX8-NEXT: s_bitcmp1_b32 s0, 3 +; GFX8-NEXT: s_cselect_b32 s63, s63, 0 +; GFX8-NEXT: s_add_i32 s63, s64, s63 +; GFX8-NEXT: s_bitcmp1_b32 s0, 4 +; GFX8-NEXT: s_cselect_b32 s62, s62, 0 +; GFX8-NEXT: s_add_i32 s62, s63, s62 +; GFX8-NEXT: s_bitcmp1_b32 s0, 5 +; GFX8-NEXT: s_cselect_b32 s61, s61, 0 +; GFX8-NEXT: s_add_i32 s61, s62, s61 +; GFX8-NEXT: s_bitcmp1_b32 s0, 6 +; GFX8-NEXT: s_cselect_b32 s60, s60, 0 +; GFX8-NEXT: s_add_i32 s60, s61, s60 +; GFX8-NEXT: s_bitcmp1_b32 s0, 7 +; GFX8-NEXT: s_cselect_b32 s59, s59, 0 +; GFX8-NEXT: s_add_i32 s59, s60, s59 +; GFX8-NEXT: s_bitcmp1_b32 s0, 8 +; GFX8-NEXT: s_cselect_b32 s58, s58, 0 +; GFX8-NEXT: s_add_i32 s58, s59, s58 +; GFX8-NEXT: s_bitcmp1_b32 s0, 9 +; GFX8-NEXT: s_cselect_b32 s57, s57, 0 +; GFX8-NEXT: s_add_i32 s57, s58, s57 +; GFX8-NEXT: s_bitcmp1_b32 s0, 10 +; GFX8-NEXT: s_cselect_b32 s56, s56, 0 +; GFX8-NEXT: s_add_i32 s56, s57, s56 +; GFX8-NEXT: s_bitcmp1_b32 s0, 11 +; GFX8-NEXT: s_cselect_b32 s55, s55, 0 +; GFX8-NEXT: s_add_i32 s55, s56, s55 +; GFX8-NEXT: s_bitcmp1_b32 s0, 12 +; GFX8-NEXT: s_cselect_b32 s54, s54, 0 +; GFX8-NEXT: s_add_i32 s54, s55, s54 +; GFX8-NEXT: s_bitcmp1_b32 s0, 13 +; GFX8-NEXT: s_cselect_b32 s53, s53, 0 +; GFX8-NEXT: s_add_i32 s53, s54, s53 +; GFX8-NEXT: s_bitcmp1_b32 s0, 14 +; GFX8-NEXT: s_cselect_b32 s52, s52, 0 +; GFX8-NEXT: s_add_i32 s52, s53, s52 +; 
GFX8-NEXT: s_bitcmp1_b32 s0, 15 +; GFX8-NEXT: s_cselect_b32 s51, s51, 0 +; GFX8-NEXT: s_add_i32 s51, s52, s51 +; GFX8-NEXT: s_bitcmp1_b32 s0, 16 +; GFX8-NEXT: s_cselect_b32 s50, s50, 0 +; GFX8-NEXT: s_add_i32 s50, s51, s50 +; GFX8-NEXT: s_bitcmp1_b32 s0, 17 +; GFX8-NEXT: s_cselect_b32 s49, s49, 0 +; GFX8-NEXT: s_add_i32 s49, s50, s49 +; GFX8-NEXT: s_bitcmp1_b32 s0, 18 +; GFX8-NEXT: s_cselect_b32 s48, s48, 0 +; GFX8-NEXT: s_add_i32 s48, s49, s48 +; GFX8-NEXT: s_bitcmp1_b32 s0, 19 +; GFX8-NEXT: s_cselect_b32 s47, s47, 0 +; GFX8-NEXT: s_add_i32 s47, s48, s47 +; GFX8-NEXT: s_bitcmp1_b32 s0, 20 +; GFX8-NEXT: s_cselect_b32 s46, s46, 0 +; GFX8-NEXT: s_add_i32 s46, s47, s46 +; GFX8-NEXT: s_bitcmp1_b32 s0, 21 +; GFX8-NEXT: s_cselect_b32 s45, s45, 0 +; GFX8-NEXT: s_add_i32 s45, s46, s45 +; GFX8-NEXT: s_bitcmp1_b32 s0, 22 +; GFX8-NEXT: s_cselect_b32 s44, s44, 0 +; GFX8-NEXT: s_add_i32 s44, s45, s44 +; GFX8-NEXT: s_bitcmp1_b32 s0, 23 +; GFX8-NEXT: s_cselect_b32 s43, s43, 0 +; GFX8-NEXT: s_add_i32 s43, s44, s43 +; GFX8-NEXT: s_bitcmp1_b32 s0, 24 +; GFX8-NEXT: s_cselect_b32 s42, s42, 0 +; GFX8-NEXT: s_add_i32 s42, s43, s42 +; GFX8-NEXT: s_bitcmp1_b32 s0, 25 +; GFX8-NEXT: s_cselect_b32 s41, s41, 0 +; GFX8-NEXT: s_add_i32 s41, s42, s41 +; GFX8-NEXT: s_bitcmp1_b32 s0, 26 +; GFX8-NEXT: s_cselect_b32 s40, s40, 0 +; GFX8-NEXT: s_add_i32 s40, s41, s40 +; GFX8-NEXT: s_bitcmp1_b32 s0, 27 +; GFX8-NEXT: s_cselect_b32 s39, s39, 0 +; GFX8-NEXT: s_add_i32 s39, s40, s39 +; GFX8-NEXT: s_bitcmp1_b32 s0, 28 +; GFX8-NEXT: s_cselect_b32 s38, s38, 0 +; GFX8-NEXT: s_add_i32 s38, s39, s38 +; GFX8-NEXT: s_bitcmp1_b32 s0, 29 +; GFX8-NEXT: s_cselect_b32 s37, s37, 0 +; GFX8-NEXT: s_add_i32 s37, s38, s37 +; GFX8-NEXT: s_bitcmp1_b32 s0, 30 +; GFX8-NEXT: s_cselect_b32 s36, s36, 0 +; GFX8-NEXT: s_add_i32 s36, s37, s36 +; GFX8-NEXT: s_bitcmp1_b32 s0, 31 +; GFX8-NEXT: s_cselect_b32 s0, s35, 0 +; GFX8-NEXT: s_add_i32 s0, s36, s0 +; GFX8-NEXT: s_bitcmp1_b32 s1, 0 +; GFX8-NEXT: s_cselect_b32 s34, s34, 0 +; 
GFX8-NEXT: s_add_i32 s0, s0, s34 +; GFX8-NEXT: s_bitcmp1_b32 s1, 1 +; GFX8-NEXT: s_cselect_b32 s33, s33, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s33 +; GFX8-NEXT: s_bitcmp1_b32 s1, 2 +; GFX8-NEXT: s_cselect_b32 s31, s31, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s31 +; GFX8-NEXT: s_bitcmp1_b32 s1, 3 +; GFX8-NEXT: s_cselect_b32 s30, s30, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s30 +; GFX8-NEXT: s_bitcmp1_b32 s1, 4 +; GFX8-NEXT: s_cselect_b32 s29, s29, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s29 +; GFX8-NEXT: s_bitcmp1_b32 s1, 5 +; GFX8-NEXT: s_cselect_b32 s28, s28, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s28 +; GFX8-NEXT: s_bitcmp1_b32 s1, 6 +; GFX8-NEXT: s_cselect_b32 s27, s27, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s27 +; GFX8-NEXT: s_bitcmp1_b32 s1, 7 +; GFX8-NEXT: s_cselect_b32 s26, s26, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s26 +; GFX8-NEXT: s_bitcmp1_b32 s1, 8 +; GFX8-NEXT: s_cselect_b32 s25, s25, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s25 +; GFX8-NEXT: s_bitcmp1_b32 s1, 9 +; GFX8-NEXT: s_cselect_b32 s24, s24, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s24 +; GFX8-NEXT: s_bitcmp1_b32 s1, 10 +; GFX8-NEXT: s_cselect_b32 s23, s23, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s23 +; GFX8-NEXT: s_bitcmp1_b32 s1, 11 +; GFX8-NEXT: s_cselect_b32 s22, s22, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s22 +; GFX8-NEXT: s_bitcmp1_b32 s1, 12 +; GFX8-NEXT: s_cselect_b32 s21, s21, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s21 +; GFX8-NEXT: s_bitcmp1_b32 s1, 13 +; GFX8-NEXT: s_cselect_b32 s20, s20, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s20 +; GFX8-NEXT: s_bitcmp1_b32 s1, 14 +; GFX8-NEXT: s_cselect_b32 s19, s19, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s19 +; GFX8-NEXT: s_bitcmp1_b32 s1, 15 +; GFX8-NEXT: s_cselect_b32 s18, s18, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s18 +; GFX8-NEXT: s_bitcmp1_b32 s1, 16 +; GFX8-NEXT: s_cselect_b32 s17, s17, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s17 +; GFX8-NEXT: s_bitcmp1_b32 s1, 17 +; GFX8-NEXT: s_cselect_b32 s16, s16, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s16 +; GFX8-NEXT: s_bitcmp1_b32 s1, 18 +; GFX8-NEXT: s_cselect_b32 s15, s15, 0 +; GFX8-NEXT: 
s_add_i32 s0, s0, s15 +; GFX8-NEXT: s_bitcmp1_b32 s1, 19 +; GFX8-NEXT: s_cselect_b32 s14, s14, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s14 +; GFX8-NEXT: s_bitcmp1_b32 s1, 20 +; GFX8-NEXT: s_cselect_b32 s13, s13, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s13 +; GFX8-NEXT: s_bitcmp1_b32 s1, 21 +; GFX8-NEXT: s_cselect_b32 s12, s12, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s12 +; GFX8-NEXT: s_bitcmp1_b32 s1, 22 +; GFX8-NEXT: s_cselect_b32 s11, s11, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s11 +; GFX8-NEXT: s_bitcmp1_b32 s1, 23 +; GFX8-NEXT: s_cselect_b32 s10, s10, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s10 +; GFX8-NEXT: s_bitcmp1_b32 s1, 24 +; GFX8-NEXT: s_cselect_b32 s9, s9, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s9 +; GFX8-NEXT: s_bitcmp1_b32 s1, 25 +; GFX8-NEXT: s_cselect_b32 s8, s8, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s8 +; GFX8-NEXT: s_bitcmp1_b32 s1, 26 +; GFX8-NEXT: s_cselect_b32 s7, s7, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_bitcmp1_b32 s1, 27 +; GFX8-NEXT: s_cselect_b32 s6, s6, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s6 +; GFX8-NEXT: s_bitcmp1_b32 s1, 28 +; GFX8-NEXT: s_cselect_b32 s5, s5, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_bitcmp1_b32 s1, 29 +; GFX8-NEXT: s_cselect_b32 s4, s4, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s4 +; GFX8-NEXT: s_bitcmp1_b32 s1, 30 +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s3 +; GFX8-NEXT: s_bitcmp1_b32 s1, 31 +; GFX8-NEXT: s_cselect_b32 s1, s2, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_u32 v2, v0 +; GFX8-NEXT: ds_add_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] -; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s67, v1 +; GFX9-NEXT: v_readlane_b32 s66, v0, 0 +; GFX9-NEXT: v_readlane_b32 s65, v0, 1 +; GFX9-NEXT: v_readlane_b32 s64, v0, 2 +; GFX9-NEXT: v_readlane_b32 s63, v0, 3 +; GFX9-NEXT: v_readlane_b32 s62, v0, 4 +; GFX9-NEXT: v_readlane_b32 s61, v0, 5 +; GFX9-NEXT: v_readlane_b32 s60, v0, 6 +; GFX9-NEXT: v_readlane_b32 s59, v0, 7 +; GFX9-NEXT: v_readlane_b32 s58, v0, 8 +; GFX9-NEXT: v_readlane_b32 s57, v0, 9 +; GFX9-NEXT: v_readlane_b32 s56, v0, 10 +; GFX9-NEXT: v_readlane_b32 s55, v0, 11 +; GFX9-NEXT: v_readlane_b32 s54, v0, 12 +; GFX9-NEXT: v_readlane_b32 s53, v0, 13 +; GFX9-NEXT: v_readlane_b32 s52, v0, 14 +; GFX9-NEXT: v_readlane_b32 s51, v0, 15 +; GFX9-NEXT: v_readlane_b32 s50, v0, 16 +; GFX9-NEXT: v_readlane_b32 s49, v0, 17 +; GFX9-NEXT: v_readlane_b32 s48, v0, 18 +; GFX9-NEXT: v_readlane_b32 s47, v0, 19 +; GFX9-NEXT: v_readlane_b32 s46, v0, 20 +; GFX9-NEXT: v_readlane_b32 s45, v0, 21 +; GFX9-NEXT: 
v_readlane_b32 s44, v0, 22 +; GFX9-NEXT: v_readlane_b32 s43, v0, 23 +; GFX9-NEXT: v_readlane_b32 s42, v0, 24 +; GFX9-NEXT: v_readlane_b32 s41, v0, 25 +; GFX9-NEXT: v_readlane_b32 s40, v0, 26 +; GFX9-NEXT: v_readlane_b32 s39, v0, 27 +; GFX9-NEXT: v_readlane_b32 s38, v0, 28 +; GFX9-NEXT: v_readlane_b32 s37, v0, 29 +; GFX9-NEXT: v_readlane_b32 s36, v0, 30 +; GFX9-NEXT: v_readlane_b32 s35, v0, 31 +; GFX9-NEXT: v_readlane_b32 s34, v0, 32 +; GFX9-NEXT: v_readlane_b32 s33, v0, 33 +; GFX9-NEXT: v_readlane_b32 s31, v0, 34 +; GFX9-NEXT: v_readlane_b32 s30, v0, 35 +; GFX9-NEXT: v_readlane_b32 s29, v0, 36 +; GFX9-NEXT: v_readlane_b32 s28, v0, 37 +; GFX9-NEXT: v_readlane_b32 s27, v0, 38 +; GFX9-NEXT: v_readlane_b32 s26, v0, 39 +; GFX9-NEXT: v_readlane_b32 s25, v0, 40 +; GFX9-NEXT: v_readlane_b32 s24, v0, 41 +; GFX9-NEXT: v_readlane_b32 s23, v0, 42 +; GFX9-NEXT: v_readlane_b32 s22, v0, 43 +; GFX9-NEXT: v_readlane_b32 s21, v0, 44 +; GFX9-NEXT: v_readlane_b32 s20, v0, 45 +; GFX9-NEXT: v_readlane_b32 s19, v0, 46 +; GFX9-NEXT: v_readlane_b32 s18, v0, 47 +; GFX9-NEXT: v_readlane_b32 s17, v0, 48 +; GFX9-NEXT: v_readlane_b32 s16, v0, 49 +; GFX9-NEXT: v_readlane_b32 s15, v0, 50 +; GFX9-NEXT: v_readlane_b32 s14, v0, 51 +; GFX9-NEXT: v_readlane_b32 s13, v0, 52 +; GFX9-NEXT: v_readlane_b32 s12, v0, 53 +; GFX9-NEXT: v_readlane_b32 s11, v0, 54 +; GFX9-NEXT: v_readlane_b32 s10, v0, 55 +; GFX9-NEXT: v_readlane_b32 s9, v0, 56 +; GFX9-NEXT: v_readlane_b32 s8, v0, 57 +; GFX9-NEXT: v_readlane_b32 s7, v0, 58 +; GFX9-NEXT: v_readlane_b32 s6, v0, 59 +; GFX9-NEXT: v_readlane_b32 s5, v0, 60 +; GFX9-NEXT: v_readlane_b32 s4, v0, 61 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: v_readlane_b32 s2, v0, 63 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s67, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[68:69], vcc ; GFX9-NEXT: s_cbranch_execz .LBB3_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_bitcmp1_b32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s66, s66, 0 +; GFX9-NEXT: s_bitcmp1_b32 s0, 1 +; 
GFX9-NEXT: s_cselect_b32 s65, s65, 0 +; GFX9-NEXT: s_add_i32 s65, s66, s65 +; GFX9-NEXT: s_bitcmp1_b32 s0, 2 +; GFX9-NEXT: s_cselect_b32 s64, s64, 0 +; GFX9-NEXT: s_add_i32 s64, s65, s64 +; GFX9-NEXT: s_bitcmp1_b32 s0, 3 +; GFX9-NEXT: s_cselect_b32 s63, s63, 0 +; GFX9-NEXT: s_add_i32 s63, s64, s63 +; GFX9-NEXT: s_bitcmp1_b32 s0, 4 +; GFX9-NEXT: s_cselect_b32 s62, s62, 0 +; GFX9-NEXT: s_add_i32 s62, s63, s62 +; GFX9-NEXT: s_bitcmp1_b32 s0, 5 +; GFX9-NEXT: s_cselect_b32 s61, s61, 0 +; GFX9-NEXT: s_add_i32 s61, s62, s61 +; GFX9-NEXT: s_bitcmp1_b32 s0, 6 +; GFX9-NEXT: s_cselect_b32 s60, s60, 0 +; GFX9-NEXT: s_add_i32 s60, s61, s60 +; GFX9-NEXT: s_bitcmp1_b32 s0, 7 +; GFX9-NEXT: s_cselect_b32 s59, s59, 0 +; GFX9-NEXT: s_add_i32 s59, s60, s59 +; GFX9-NEXT: s_bitcmp1_b32 s0, 8 +; GFX9-NEXT: s_cselect_b32 s58, s58, 0 +; GFX9-NEXT: s_add_i32 s58, s59, s58 +; GFX9-NEXT: s_bitcmp1_b32 s0, 9 +; GFX9-NEXT: s_cselect_b32 s57, s57, 0 +; GFX9-NEXT: s_add_i32 s57, s58, s57 +; GFX9-NEXT: s_bitcmp1_b32 s0, 10 +; GFX9-NEXT: s_cselect_b32 s56, s56, 0 +; GFX9-NEXT: s_add_i32 s56, s57, s56 +; GFX9-NEXT: s_bitcmp1_b32 s0, 11 +; GFX9-NEXT: s_cselect_b32 s55, s55, 0 +; GFX9-NEXT: s_add_i32 s55, s56, s55 +; GFX9-NEXT: s_bitcmp1_b32 s0, 12 +; GFX9-NEXT: s_cselect_b32 s54, s54, 0 +; GFX9-NEXT: s_add_i32 s54, s55, s54 +; GFX9-NEXT: s_bitcmp1_b32 s0, 13 +; GFX9-NEXT: s_cselect_b32 s53, s53, 0 +; GFX9-NEXT: s_add_i32 s53, s54, s53 +; GFX9-NEXT: s_bitcmp1_b32 s0, 14 +; GFX9-NEXT: s_cselect_b32 s52, s52, 0 +; GFX9-NEXT: s_add_i32 s52, s53, s52 +; GFX9-NEXT: s_bitcmp1_b32 s0, 15 +; GFX9-NEXT: s_cselect_b32 s51, s51, 0 +; GFX9-NEXT: s_add_i32 s51, s52, s51 +; GFX9-NEXT: s_bitcmp1_b32 s0, 16 +; GFX9-NEXT: s_cselect_b32 s50, s50, 0 +; GFX9-NEXT: s_add_i32 s50, s51, s50 +; GFX9-NEXT: s_bitcmp1_b32 s0, 17 +; GFX9-NEXT: s_cselect_b32 s49, s49, 0 +; GFX9-NEXT: s_add_i32 s49, s50, s49 +; GFX9-NEXT: s_bitcmp1_b32 s0, 18 +; GFX9-NEXT: s_cselect_b32 s48, s48, 0 +; GFX9-NEXT: s_add_i32 s48, s49, s48 +; 
GFX9-NEXT: s_bitcmp1_b32 s0, 19 +; GFX9-NEXT: s_cselect_b32 s47, s47, 0 +; GFX9-NEXT: s_add_i32 s47, s48, s47 +; GFX9-NEXT: s_bitcmp1_b32 s0, 20 +; GFX9-NEXT: s_cselect_b32 s46, s46, 0 +; GFX9-NEXT: s_add_i32 s46, s47, s46 +; GFX9-NEXT: s_bitcmp1_b32 s0, 21 +; GFX9-NEXT: s_cselect_b32 s45, s45, 0 +; GFX9-NEXT: s_add_i32 s45, s46, s45 +; GFX9-NEXT: s_bitcmp1_b32 s0, 22 +; GFX9-NEXT: s_cselect_b32 s44, s44, 0 +; GFX9-NEXT: s_add_i32 s44, s45, s44 +; GFX9-NEXT: s_bitcmp1_b32 s0, 23 +; GFX9-NEXT: s_cselect_b32 s43, s43, 0 +; GFX9-NEXT: s_add_i32 s43, s44, s43 +; GFX9-NEXT: s_bitcmp1_b32 s0, 24 +; GFX9-NEXT: s_cselect_b32 s42, s42, 0 +; GFX9-NEXT: s_add_i32 s42, s43, s42 +; GFX9-NEXT: s_bitcmp1_b32 s0, 25 +; GFX9-NEXT: s_cselect_b32 s41, s41, 0 +; GFX9-NEXT: s_add_i32 s41, s42, s41 +; GFX9-NEXT: s_bitcmp1_b32 s0, 26 +; GFX9-NEXT: s_cselect_b32 s40, s40, 0 +; GFX9-NEXT: s_add_i32 s40, s41, s40 +; GFX9-NEXT: s_bitcmp1_b32 s0, 27 +; GFX9-NEXT: s_cselect_b32 s39, s39, 0 +; GFX9-NEXT: s_add_i32 s39, s40, s39 +; GFX9-NEXT: s_bitcmp1_b32 s0, 28 +; GFX9-NEXT: s_cselect_b32 s38, s38, 0 +; GFX9-NEXT: s_add_i32 s38, s39, s38 +; GFX9-NEXT: s_bitcmp1_b32 s0, 29 +; GFX9-NEXT: s_cselect_b32 s37, s37, 0 +; GFX9-NEXT: s_add_i32 s37, s38, s37 +; GFX9-NEXT: s_bitcmp1_b32 s0, 30 +; GFX9-NEXT: s_cselect_b32 s36, s36, 0 +; GFX9-NEXT: s_add_i32 s36, s37, s36 +; GFX9-NEXT: s_bitcmp1_b32 s0, 31 +; GFX9-NEXT: s_cselect_b32 s0, s35, 0 +; GFX9-NEXT: s_add_i32 s0, s36, s0 +; GFX9-NEXT: s_bitcmp1_b32 s1, 0 +; GFX9-NEXT: s_cselect_b32 s34, s34, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s34 +; GFX9-NEXT: s_bitcmp1_b32 s1, 1 +; GFX9-NEXT: s_cselect_b32 s33, s33, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s33 +; GFX9-NEXT: s_bitcmp1_b32 s1, 2 +; GFX9-NEXT: s_cselect_b32 s31, s31, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s31 +; GFX9-NEXT: s_bitcmp1_b32 s1, 3 +; GFX9-NEXT: s_cselect_b32 s30, s30, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s30 +; GFX9-NEXT: s_bitcmp1_b32 s1, 4 +; GFX9-NEXT: s_cselect_b32 s29, s29, 0 +; GFX9-NEXT: 
s_add_i32 s0, s0, s29 +; GFX9-NEXT: s_bitcmp1_b32 s1, 5 +; GFX9-NEXT: s_cselect_b32 s28, s28, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s28 +; GFX9-NEXT: s_bitcmp1_b32 s1, 6 +; GFX9-NEXT: s_cselect_b32 s27, s27, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s27 +; GFX9-NEXT: s_bitcmp1_b32 s1, 7 +; GFX9-NEXT: s_cselect_b32 s26, s26, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s26 +; GFX9-NEXT: s_bitcmp1_b32 s1, 8 +; GFX9-NEXT: s_cselect_b32 s25, s25, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s25 +; GFX9-NEXT: s_bitcmp1_b32 s1, 9 +; GFX9-NEXT: s_cselect_b32 s24, s24, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s24 +; GFX9-NEXT: s_bitcmp1_b32 s1, 10 +; GFX9-NEXT: s_cselect_b32 s23, s23, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s23 +; GFX9-NEXT: s_bitcmp1_b32 s1, 11 +; GFX9-NEXT: s_cselect_b32 s22, s22, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s22 +; GFX9-NEXT: s_bitcmp1_b32 s1, 12 +; GFX9-NEXT: s_cselect_b32 s21, s21, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s21 +; GFX9-NEXT: s_bitcmp1_b32 s1, 13 +; GFX9-NEXT: s_cselect_b32 s20, s20, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s20 +; GFX9-NEXT: s_bitcmp1_b32 s1, 14 +; GFX9-NEXT: s_cselect_b32 s19, s19, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s19 +; GFX9-NEXT: s_bitcmp1_b32 s1, 15 +; GFX9-NEXT: s_cselect_b32 s18, s18, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s18 +; GFX9-NEXT: s_bitcmp1_b32 s1, 16 +; GFX9-NEXT: s_cselect_b32 s17, s17, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s17 +; GFX9-NEXT: s_bitcmp1_b32 s1, 17 +; GFX9-NEXT: s_cselect_b32 s16, s16, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s16 +; GFX9-NEXT: s_bitcmp1_b32 s1, 18 +; GFX9-NEXT: s_cselect_b32 s15, s15, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s15 +; GFX9-NEXT: s_bitcmp1_b32 s1, 19 +; GFX9-NEXT: s_cselect_b32 s14, s14, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s14 +; GFX9-NEXT: s_bitcmp1_b32 s1, 20 +; GFX9-NEXT: s_cselect_b32 s13, s13, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s13 +; GFX9-NEXT: s_bitcmp1_b32 s1, 21 +; GFX9-NEXT: s_cselect_b32 s12, s12, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s12 +; GFX9-NEXT: s_bitcmp1_b32 s1, 22 +; GFX9-NEXT: s_cselect_b32 s11, s11, 0 +; GFX9-NEXT: 
s_add_i32 s0, s0, s11 +; GFX9-NEXT: s_bitcmp1_b32 s1, 23 +; GFX9-NEXT: s_cselect_b32 s10, s10, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s10 +; GFX9-NEXT: s_bitcmp1_b32 s1, 24 +; GFX9-NEXT: s_cselect_b32 s9, s9, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s9 +; GFX9-NEXT: s_bitcmp1_b32 s1, 25 +; GFX9-NEXT: s_cselect_b32 s8, s8, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s8 +; GFX9-NEXT: s_bitcmp1_b32 s1, 26 +; GFX9-NEXT: s_cselect_b32 s7, s7, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_bitcmp1_b32 s1, 27 +; GFX9-NEXT: s_cselect_b32 s6, s6, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_bitcmp1_b32 s1, 28 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s5 +; GFX9-NEXT: s_bitcmp1_b32 s1, 29 +; GFX9-NEXT: s_cselect_b32 s4, s4, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_bitcmp1_b32 s1, 30 +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s3 +; GFX9-NEXT: s_bitcmp1_b32 s1, 31 +; GFX9-NEXT: s_cselect_b32 s1, s2, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_u32 v2, v0 +; GFX9-NEXT: ds_add_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB3_2: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; 
GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_readlane_b32 s66, v0, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1064-NEXT: v_readlane_b32 s65, v0, 1 +; GFX1064-NEXT: v_readlane_b32 s64, v0, 2 +; GFX1064-NEXT: v_readlane_b32 s63, v0, 3 +; GFX1064-NEXT: v_readlane_b32 s62, v0, 4 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX1064-NEXT: v_readlane_b32 s61, v0, 5 +; GFX1064-NEXT: v_readlane_b32 s60, v0, 6 +; GFX1064-NEXT: v_readlane_b32 s59, v0, 7 +; GFX1064-NEXT: v_readlane_b32 s58, v0, 8 +; GFX1064-NEXT: v_readfirstlane_b32 s67, v1 +; GFX1064-NEXT: v_readlane_b32 s57, v0, 9 +; GFX1064-NEXT: v_readlane_b32 s56, v0, 10 +; GFX1064-NEXT: v_readlane_b32 s55, v0, 11 +; GFX1064-NEXT: v_readlane_b32 s54, v0, 12 +; GFX1064-NEXT: v_readlane_b32 s53, v0, 13 +; GFX1064-NEXT: v_readlane_b32 s52, v0, 14 +; GFX1064-NEXT: v_readlane_b32 s51, v0, 15 +; GFX1064-NEXT: v_readlane_b32 s50, v0, 16 +; GFX1064-NEXT: v_readlane_b32 s49, v0, 17 +; GFX1064-NEXT: v_readlane_b32 s48, v0, 18 +; GFX1064-NEXT: v_readlane_b32 s47, v0, 19 +; GFX1064-NEXT: v_readlane_b32 s46, v0, 20 +; GFX1064-NEXT: v_readlane_b32 s45, v0, 21 +; GFX1064-NEXT: v_readlane_b32 s44, v0, 22 +; GFX1064-NEXT: v_readlane_b32 s43, v0, 23 +; GFX1064-NEXT: v_readlane_b32 s42, v0, 24 +; GFX1064-NEXT: v_readlane_b32 s41, v0, 25 +; GFX1064-NEXT: v_readlane_b32 s40, v0, 26 +; GFX1064-NEXT: v_readlane_b32 s39, v0, 27 +; GFX1064-NEXT: v_readlane_b32 s38, v0, 28 +; GFX1064-NEXT: v_readlane_b32 
s37, v0, 29 +; GFX1064-NEXT: v_readlane_b32 s36, v0, 30 +; GFX1064-NEXT: v_readlane_b32 s35, v0, 31 +; GFX1064-NEXT: v_readlane_b32 s34, v0, 32 +; GFX1064-NEXT: v_readlane_b32 s33, v0, 33 +; GFX1064-NEXT: v_readlane_b32 s31, v0, 34 +; GFX1064-NEXT: v_readlane_b32 s30, v0, 35 +; GFX1064-NEXT: v_readlane_b32 s29, v0, 36 +; GFX1064-NEXT: v_readlane_b32 s28, v0, 37 +; GFX1064-NEXT: v_readlane_b32 s27, v0, 38 +; GFX1064-NEXT: v_readlane_b32 s26, v0, 39 +; GFX1064-NEXT: v_readlane_b32 s25, v0, 40 +; GFX1064-NEXT: v_readlane_b32 s24, v0, 41 +; GFX1064-NEXT: v_readlane_b32 s23, v0, 42 +; GFX1064-NEXT: v_readlane_b32 s22, v0, 43 +; GFX1064-NEXT: v_readlane_b32 s21, v0, 44 +; GFX1064-NEXT: v_readlane_b32 s20, v0, 45 +; GFX1064-NEXT: v_readlane_b32 s19, v0, 46 +; GFX1064-NEXT: v_readlane_b32 s18, v0, 47 +; GFX1064-NEXT: v_readlane_b32 s17, v0, 48 +; GFX1064-NEXT: v_readlane_b32 s16, v0, 49 +; GFX1064-NEXT: v_readlane_b32 s15, v0, 50 +; GFX1064-NEXT: v_readlane_b32 s14, v0, 51 +; GFX1064-NEXT: v_readlane_b32 s13, v0, 52 +; GFX1064-NEXT: v_readlane_b32 s12, v0, 53 +; GFX1064-NEXT: v_readlane_b32 s11, v0, 54 +; GFX1064-NEXT: v_readlane_b32 s10, v0, 55 +; GFX1064-NEXT: v_readlane_b32 s9, v0, 56 +; GFX1064-NEXT: v_readlane_b32 s8, v0, 57 +; GFX1064-NEXT: v_readlane_b32 s6, v0, 58 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 59 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: v_readlane_b32 s4, v0, 61 +; GFX1064-NEXT: v_readlane_b32 s3, v0, 62 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 63 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s67, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[68:69], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_cselect_b32 s66, s66, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1064-NEXT: s_cselect_b32 s65, s65, 0 +; GFX1064-NEXT: s_add_i32 s65, s66, s65 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1064-NEXT: 
s_cselect_b32 s64, s64, 0 +; GFX1064-NEXT: s_add_i32 s64, s65, s64 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1064-NEXT: s_cselect_b32 s63, s63, 0 +; GFX1064-NEXT: s_add_i32 s63, s64, s63 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1064-NEXT: s_cselect_b32 s62, s62, 0 +; GFX1064-NEXT: s_add_i32 s62, s63, s62 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1064-NEXT: s_cselect_b32 s61, s61, 0 +; GFX1064-NEXT: s_add_i32 s61, s62, s61 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1064-NEXT: s_cselect_b32 s60, s60, 0 +; GFX1064-NEXT: s_add_i32 s60, s61, s60 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1064-NEXT: s_cselect_b32 s59, s59, 0 +; GFX1064-NEXT: s_add_i32 s59, s60, s59 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1064-NEXT: s_cselect_b32 s58, s58, 0 +; GFX1064-NEXT: s_add_i32 s58, s59, s58 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1064-NEXT: s_cselect_b32 s57, s57, 0 +; GFX1064-NEXT: s_add_i32 s57, s58, s57 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1064-NEXT: s_cselect_b32 s56, s56, 0 +; GFX1064-NEXT: s_add_i32 s56, s57, s56 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1064-NEXT: s_cselect_b32 s55, s55, 0 +; GFX1064-NEXT: s_add_i32 s55, s56, s55 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1064-NEXT: s_cselect_b32 s54, s54, 0 +; GFX1064-NEXT: s_add_i32 s54, s55, s54 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1064-NEXT: s_cselect_b32 s53, s53, 0 +; GFX1064-NEXT: s_add_i32 s53, s54, s53 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1064-NEXT: s_cselect_b32 s52, s52, 0 +; GFX1064-NEXT: s_add_i32 s52, s53, s52 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1064-NEXT: s_cselect_b32 s51, s51, 0 +; GFX1064-NEXT: s_add_i32 s51, s52, s51 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1064-NEXT: s_cselect_b32 s50, s50, 0 +; GFX1064-NEXT: s_add_i32 s50, s51, s50 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1064-NEXT: s_cselect_b32 s49, s49, 0 +; GFX1064-NEXT: s_add_i32 s49, s50, s49 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1064-NEXT: s_cselect_b32 s48, s48, 0 +; GFX1064-NEXT: s_add_i32 
s48, s49, s48 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1064-NEXT: s_cselect_b32 s47, s47, 0 +; GFX1064-NEXT: s_add_i32 s47, s48, s47 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1064-NEXT: s_cselect_b32 s46, s46, 0 +; GFX1064-NEXT: s_add_i32 s46, s47, s46 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1064-NEXT: s_cselect_b32 s45, s45, 0 +; GFX1064-NEXT: s_add_i32 s45, s46, s45 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1064-NEXT: s_cselect_b32 s44, s44, 0 +; GFX1064-NEXT: s_add_i32 s44, s45, s44 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1064-NEXT: s_cselect_b32 s43, s43, 0 +; GFX1064-NEXT: s_add_i32 s43, s44, s43 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1064-NEXT: s_cselect_b32 s42, s42, 0 +; GFX1064-NEXT: s_add_i32 s42, s43, s42 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1064-NEXT: s_cselect_b32 s41, s41, 0 +; GFX1064-NEXT: s_add_i32 s41, s42, s41 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1064-NEXT: s_cselect_b32 s40, s40, 0 +; GFX1064-NEXT: s_add_i32 s40, s41, s40 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1064-NEXT: s_cselect_b32 s39, s39, 0 +; GFX1064-NEXT: s_add_i32 s39, s40, s39 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1064-NEXT: s_cselect_b32 s38, s38, 0 +; GFX1064-NEXT: s_add_i32 s38, s39, s38 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1064-NEXT: s_cselect_b32 s37, s37, 0 +; GFX1064-NEXT: s_add_i32 s37, s38, s37 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1064-NEXT: s_cselect_b32 s36, s36, 0 +; GFX1064-NEXT: s_add_i32 s36, s37, s36 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1064-NEXT: s_cselect_b32 s0, s35, 0 +; GFX1064-NEXT: s_add_i32 s0, s36, s0 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 0 +; GFX1064-NEXT: s_cselect_b32 s34, s34, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s34 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 1 +; GFX1064-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s33 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 2 +; GFX1064-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s31 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 3 +; 
GFX1064-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s30 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 4 +; GFX1064-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s29 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 5 +; GFX1064-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s28 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 6 +; GFX1064-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s27 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 7 +; GFX1064-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s26 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 8 +; GFX1064-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s25 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 9 +; GFX1064-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s24 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 10 +; GFX1064-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s23 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 11 +; GFX1064-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s22 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 12 +; GFX1064-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s21 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 13 +; GFX1064-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s20 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 14 +; GFX1064-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s19 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 15 +; GFX1064-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s18 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 16 +; GFX1064-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s17 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 17 +; GFX1064-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s16 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 18 +; GFX1064-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s15 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 19 +; GFX1064-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s14 +; 
GFX1064-NEXT: s_bitcmp1_b32 s1, 20 +; GFX1064-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s13 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 21 +; GFX1064-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s12 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 22 +; GFX1064-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s11 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 23 +; GFX1064-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s10 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 24 +; GFX1064-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s9 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 25 +; GFX1064-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s8 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 26 +; GFX1064-NEXT: s_cselect_b32 s6, s6, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s6 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 27 +; GFX1064-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s5 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 28 +; GFX1064-NEXT: s_cselect_b32 s5, s7, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s5 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 29 +; GFX1064-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s4 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 30 +; GFX1064-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s3 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 31 +; GFX1064-NEXT: s_cselect_b32 s1, s2, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_u32 v0, v3 +; GFX1064-NEXT: ds_add_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB3_2: @@ -938,29 +6369,145 @@ ; ; GFX1032-LABEL: add_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, 
-1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: v_readlane_b32 s33, v0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1032-NEXT: v_readlane_b32 s31, v0, 1 +; GFX1032-NEXT: v_readlane_b32 s30, v0, 2 +; GFX1032-NEXT: v_readlane_b32 s29, v0, 3 +; GFX1032-NEXT: v_readlane_b32 s28, v0, 4 +; GFX1032-NEXT: v_readfirstlane_b32 s34, v1 +; GFX1032-NEXT: v_readlane_b32 s27, v0, 5 +; GFX1032-NEXT: v_readlane_b32 s26, v0, 6 +; GFX1032-NEXT: v_readlane_b32 s25, v0, 7 +; GFX1032-NEXT: v_readlane_b32 s24, v0, 8 +; GFX1032-NEXT: v_readlane_b32 s23, v0, 9 +; GFX1032-NEXT: v_readlane_b32 s22, v0, 10 +; GFX1032-NEXT: v_readlane_b32 s21, v0, 11 +; GFX1032-NEXT: v_readlane_b32 s20, v0, 12 +; GFX1032-NEXT: v_readlane_b32 s19, v0, 13 +; GFX1032-NEXT: v_readlane_b32 s18, v0, 14 +; GFX1032-NEXT: v_readlane_b32 s17, v0, 15 +; GFX1032-NEXT: v_readlane_b32 s16, v0, 16 +; GFX1032-NEXT: v_readlane_b32 s15, v0, 17 +; GFX1032-NEXT: v_readlane_b32 s14, v0, 18 +; GFX1032-NEXT: v_readlane_b32 s13, v0, 19 +; GFX1032-NEXT: v_readlane_b32 s12, v0, 20 +; GFX1032-NEXT: v_readlane_b32 s11, v0, 21 +; GFX1032-NEXT: v_readlane_b32 s10, v0, 22 +; GFX1032-NEXT: v_readlane_b32 s9, v0, 23 +; GFX1032-NEXT: v_readlane_b32 s8, v0, 
24 +; GFX1032-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1032-NEXT: v_readlane_b32 s5, v0, 26 +; GFX1032-NEXT: v_readlane_b32 s4, v0, 27 +; GFX1032-NEXT: v_readlane_b32 s6, v0, 28 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1032-NEXT: v_readlane_b32 s1, v0, 31 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s34, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s34, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1032-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1032-NEXT: s_add_i32 s31, s33, s31 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1032-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1032-NEXT: s_add_i32 s30, s31, s30 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1032-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1032-NEXT: s_add_i32 s29, s30, s29 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1032-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1032-NEXT: s_add_i32 s28, s29, s28 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1032-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1032-NEXT: s_add_i32 s27, s28, s27 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1032-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1032-NEXT: s_add_i32 s26, s27, s26 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1032-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1032-NEXT: s_add_i32 s25, s26, s25 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1032-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1032-NEXT: s_add_i32 s24, s25, s24 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1032-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1032-NEXT: s_add_i32 s23, s24, s23 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1032-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1032-NEXT: s_add_i32 s22, s23, s22 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1032-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1032-NEXT: s_add_i32 s21, s22, s21 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1032-NEXT: s_cselect_b32 s20, s20, 0 
+; GFX1032-NEXT: s_add_i32 s20, s21, s20 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1032-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1032-NEXT: s_add_i32 s19, s20, s19 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1032-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1032-NEXT: s_add_i32 s18, s19, s18 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1032-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1032-NEXT: s_add_i32 s17, s18, s17 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1032-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1032-NEXT: s_add_i32 s16, s17, s16 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1032-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1032-NEXT: s_add_i32 s15, s16, s15 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1032-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1032-NEXT: s_add_i32 s14, s15, s14 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1032-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1032-NEXT: s_add_i32 s13, s14, s13 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1032-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1032-NEXT: s_add_i32 s12, s13, s12 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1032-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1032-NEXT: s_add_i32 s11, s12, s11 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1032-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1032-NEXT: s_add_i32 s10, s11, s10 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1032-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1032-NEXT: s_add_i32 s9, s10, s9 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1032-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1032-NEXT: s_add_i32 s8, s9, s8 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1032-NEXT: s_cselect_b32 s7, s7, 0 +; GFX1032-NEXT: s_add_i32 s7, s8, s7 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1032-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1032-NEXT: s_add_i32 s5, s7, s5 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1032-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1032-NEXT: s_add_i32 s4, s5, s4 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1032-NEXT: s_cselect_b32 s5, s6, 0 +; GFX1032-NEXT: s_add_i32 s4, s4, s5 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 
29 +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_add_i32 s3, s4, s3 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_add_i32 s2, s3, s2 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1032-NEXT: s_cselect_b32 s0, s1, 0 +; GFX1032-NEXT: s_add_i32 s0, s2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_u32 v0, v3 +; GFX1032-NEXT: ds_add_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB3_2: @@ -968,41 +6515,306 @@ ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; 
GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-NEXT: v_readlane_b32 s66, v0, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1164-NEXT: v_readlane_b32 s65, v0, 1 +; GFX1164-NEXT: v_readlane_b32 s64, v0, 2 +; GFX1164-NEXT: v_readlane_b32 s63, v0, 3 +; GFX1164-NEXT: v_readlane_b32 s62, v0, 4 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX1164-NEXT: v_readlane_b32 s61, v0, 5 +; GFX1164-NEXT: v_readlane_b32 s60, v0, 6 +; GFX1164-NEXT: v_readlane_b32 s59, v0, 7 +; GFX1164-NEXT: v_readlane_b32 s58, v0, 8 +; GFX1164-NEXT: v_readlane_b32 s57, v0, 9 +; GFX1164-NEXT: v_readlane_b32 s56, v0, 10 +; GFX1164-NEXT: v_readlane_b32 s55, v0, 11 +; GFX1164-NEXT: v_readlane_b32 s54, v0, 12 +; GFX1164-NEXT: v_readlane_b32 s53, v0, 13 +; GFX1164-NEXT: v_readlane_b32 s52, v0, 14 +; GFX1164-NEXT: v_readlane_b32 s51, v0, 15 +; GFX1164-NEXT: v_readlane_b32 s50, v0, 16 +; GFX1164-NEXT: v_readlane_b32 s49, v0, 17 +; GFX1164-NEXT: v_readlane_b32 s48, v0, 18 +; GFX1164-NEXT: v_readlane_b32 s47, v0, 19 +; GFX1164-NEXT: v_readlane_b32 s46, v0, 20 +; GFX1164-NEXT: v_readlane_b32 s45, v0, 21 +; GFX1164-NEXT: v_readlane_b32 s44, v0, 22 +; GFX1164-NEXT: v_readlane_b32 s43, v0, 23 +; GFX1164-NEXT: v_readlane_b32 s42, v0, 24 +; GFX1164-NEXT: v_readlane_b32 s41, v0, 25 +; GFX1164-NEXT: v_readlane_b32 s40, v0, 26 +; GFX1164-NEXT: v_readlane_b32 s39, v0, 27 +; GFX1164-NEXT: v_readlane_b32 s38, v0, 28 +; GFX1164-NEXT: v_readlane_b32 s37, v0, 29 +; GFX1164-NEXT: v_readlane_b32 s36, v0, 30 +; GFX1164-NEXT: v_readlane_b32 s35, v0, 31 +; 
GFX1164-NEXT: v_readlane_b32 s34, v0, 32 +; GFX1164-NEXT: v_readlane_b32 s33, v0, 33 +; GFX1164-NEXT: v_readlane_b32 s31, v0, 34 +; GFX1164-NEXT: v_readlane_b32 s30, v0, 35 +; GFX1164-NEXT: v_readlane_b32 s29, v0, 36 +; GFX1164-NEXT: v_readlane_b32 s28, v0, 37 +; GFX1164-NEXT: v_readlane_b32 s27, v0, 38 +; GFX1164-NEXT: v_readlane_b32 s26, v0, 39 +; GFX1164-NEXT: v_readlane_b32 s25, v0, 40 +; GFX1164-NEXT: v_readlane_b32 s24, v0, 41 +; GFX1164-NEXT: v_readlane_b32 s23, v0, 42 +; GFX1164-NEXT: v_readlane_b32 s22, v0, 43 +; GFX1164-NEXT: v_readlane_b32 s21, v0, 44 +; GFX1164-NEXT: v_readlane_b32 s20, v0, 45 +; GFX1164-NEXT: v_readlane_b32 s19, v0, 46 +; GFX1164-NEXT: v_readlane_b32 s18, v0, 47 +; GFX1164-NEXT: v_readlane_b32 s17, v0, 48 +; GFX1164-NEXT: v_readlane_b32 s16, v0, 49 +; GFX1164-NEXT: v_readlane_b32 s15, v0, 50 +; GFX1164-NEXT: v_readlane_b32 s14, v0, 51 +; GFX1164-NEXT: v_readlane_b32 s13, v0, 52 +; GFX1164-NEXT: v_readlane_b32 s12, v0, 53 +; GFX1164-NEXT: v_readlane_b32 s11, v0, 54 +; GFX1164-NEXT: v_readlane_b32 s10, v0, 55 +; GFX1164-NEXT: v_readlane_b32 s9, v0, 56 +; GFX1164-NEXT: v_readlane_b32 s8, v0, 57 +; GFX1164-NEXT: v_readlane_b32 s6, v0, 58 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 59 +; GFX1164-NEXT: v_readfirstlane_b32 s67, v1 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: v_readlane_b32 s4, v0, 61 +; GFX1164-NEXT: v_readlane_b32 s3, v0, 62 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 63 +; GFX1164-NEXT: s_mov_b64 s[68:69], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e64 s67, v1 ; GFX1164-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_cselect_b32 s66, s66, 0 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1164-NEXT: s_cselect_b32 s65, s65, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s65, s66, s65 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1164-NEXT: s_cselect_b32 s64, 
s64, 0 +; GFX1164-NEXT: s_add_i32 s64, s65, s64 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1164-NEXT: s_cselect_b32 s63, s63, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s63, s64, s63 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1164-NEXT: s_cselect_b32 s62, s62, 0 +; GFX1164-NEXT: s_add_i32 s62, s63, s62 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1164-NEXT: s_cselect_b32 s61, s61, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s61, s62, s61 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1164-NEXT: s_cselect_b32 s60, s60, 0 +; GFX1164-NEXT: s_add_i32 s60, s61, s60 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1164-NEXT: s_cselect_b32 s59, s59, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s59, s60, s59 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1164-NEXT: s_cselect_b32 s58, s58, 0 +; GFX1164-NEXT: s_add_i32 s58, s59, s58 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1164-NEXT: s_cselect_b32 s57, s57, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s57, s58, s57 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1164-NEXT: s_cselect_b32 s56, s56, 0 +; GFX1164-NEXT: s_add_i32 s56, s57, s56 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1164-NEXT: s_cselect_b32 s55, s55, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s55, s56, s55 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1164-NEXT: s_cselect_b32 s54, s54, 0 +; GFX1164-NEXT: s_add_i32 s54, s55, s54 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1164-NEXT: s_cselect_b32 s53, s53, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s53, s54, s53 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 14 +; 
GFX1164-NEXT: s_cselect_b32 s52, s52, 0 +; GFX1164-NEXT: s_add_i32 s52, s53, s52 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1164-NEXT: s_cselect_b32 s51, s51, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s51, s52, s51 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1164-NEXT: s_cselect_b32 s50, s50, 0 +; GFX1164-NEXT: s_add_i32 s50, s51, s50 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1164-NEXT: s_cselect_b32 s49, s49, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s49, s50, s49 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1164-NEXT: s_cselect_b32 s48, s48, 0 +; GFX1164-NEXT: s_add_i32 s48, s49, s48 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1164-NEXT: s_cselect_b32 s47, s47, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s47, s48, s47 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1164-NEXT: s_cselect_b32 s46, s46, 0 +; GFX1164-NEXT: s_add_i32 s46, s47, s46 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1164-NEXT: s_cselect_b32 s45, s45, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s45, s46, s45 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1164-NEXT: s_cselect_b32 s44, s44, 0 +; GFX1164-NEXT: s_add_i32 s44, s45, s44 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1164-NEXT: s_cselect_b32 s43, s43, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s43, s44, s43 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1164-NEXT: s_cselect_b32 s42, s42, 0 +; GFX1164-NEXT: s_add_i32 s42, s43, s42 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1164-NEXT: s_cselect_b32 s41, s41, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s41, s42, s41 +; 
GFX1164-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1164-NEXT: s_cselect_b32 s40, s40, 0 +; GFX1164-NEXT: s_add_i32 s40, s41, s40 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1164-NEXT: s_cselect_b32 s39, s39, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s39, s40, s39 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1164-NEXT: s_cselect_b32 s38, s38, 0 +; GFX1164-NEXT: s_add_i32 s38, s39, s38 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1164-NEXT: s_cselect_b32 s37, s37, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s37, s38, s37 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1164-NEXT: s_cselect_b32 s36, s36, 0 +; GFX1164-NEXT: s_add_i32 s36, s37, s36 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1164-NEXT: s_cselect_b32 s0, s35, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s36, s0 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 0 +; GFX1164-NEXT: s_cselect_b32 s34, s34, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s34 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 1 +; GFX1164-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s33 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 2 +; GFX1164-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s31 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 3 +; GFX1164-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s30 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 4 +; GFX1164-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s29 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 5 +; GFX1164-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, 
s28 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 6 +; GFX1164-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s27 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 7 +; GFX1164-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s26 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 8 +; GFX1164-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s25 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 9 +; GFX1164-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s24 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 10 +; GFX1164-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s23 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 11 +; GFX1164-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s22 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 12 +; GFX1164-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s21 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 13 +; GFX1164-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s20 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 14 +; GFX1164-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s19 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 15 +; GFX1164-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s18 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 16 +; GFX1164-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s17 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 17 +; GFX1164-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, 
s16 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 18 +; GFX1164-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s15 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 19 +; GFX1164-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s14 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 20 +; GFX1164-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s13 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 21 +; GFX1164-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s12 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 22 +; GFX1164-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s11 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 23 +; GFX1164-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s10 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 24 +; GFX1164-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s9 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 25 +; GFX1164-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s8 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 26 +; GFX1164-NEXT: s_cselect_b32 s6, s6, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s6 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 27 +; GFX1164-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s5 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 28 +; GFX1164-NEXT: s_cselect_b32 s5, s7, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s5 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 29 +; GFX1164-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s4 +; 
GFX1164-NEXT: s_bitcmp1_b32 s1, 30 +; GFX1164-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s3 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 31 +; GFX1164-NEXT: s_cselect_b32 s1, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_u32 v0, v3 +; GFX1164-NEXT: ds_add_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB3_2: @@ -1010,33 +6822,160 @@ ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-NEXT: v_readlane_b32 s33, v0, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1132-NEXT: v_readlane_b32 s31, v0, 1 +; GFX1132-NEXT: v_readlane_b32 s30, v0, 2 +; GFX1132-NEXT: v_readlane_b32 s29, v0, 3 +; GFX1132-NEXT: v_readlane_b32 s28, v0, 4 +; GFX1132-NEXT: v_readfirstlane_b32 s34, v1 +; GFX1132-NEXT: v_readlane_b32 s27, v0, 5 +; GFX1132-NEXT: v_readlane_b32 s26, v0, 6 +; GFX1132-NEXT: v_readlane_b32 s25, v0, 7 +; GFX1132-NEXT: v_readlane_b32 s24, v0, 8 +; GFX1132-NEXT: v_readlane_b32 s23, v0, 9 +; GFX1132-NEXT: v_readlane_b32 s22, v0, 10 +; GFX1132-NEXT: v_readlane_b32 s21, v0, 11 +; GFX1132-NEXT: v_readlane_b32 s20, v0, 12 +; GFX1132-NEXT: v_readlane_b32 s19, v0, 13 +; GFX1132-NEXT: v_readlane_b32 s18, v0, 14 +; GFX1132-NEXT: v_readlane_b32 s17, v0, 15 +; GFX1132-NEXT: v_readlane_b32 s16, v0, 16 +; GFX1132-NEXT: v_readlane_b32 s15, v0, 17 +; GFX1132-NEXT: v_readlane_b32 s14, v0, 18 +; GFX1132-NEXT: v_readlane_b32 s13, v0, 19 +; GFX1132-NEXT: v_readlane_b32 s12, v0, 20 +; GFX1132-NEXT: v_readlane_b32 s11, v0, 21 +; GFX1132-NEXT: v_readlane_b32 s10, v0, 22 +; GFX1132-NEXT: v_readlane_b32 s9, v0, 23 +; GFX1132-NEXT: v_readlane_b32 s8, v0, 24 +; GFX1132-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1132-NEXT: v_readlane_b32 s5, v0, 26 +; GFX1132-NEXT: v_readlane_b32 s4, v0, 27 +; GFX1132-NEXT: v_readlane_b32 s6, v0, 28 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1132-NEXT: v_readlane_b32 s1, v0, 31 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s34, v1 +; GFX1132-NEXT: s_and_saveexec_b32 s34, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1132-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1132-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | 
instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s31, s33, s31 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1132-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1132-NEXT: s_add_i32 s30, s31, s30 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1132-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s29, s30, s29 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1132-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1132-NEXT: s_add_i32 s28, s29, s28 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1132-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s27, s28, s27 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1132-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1132-NEXT: s_add_i32 s26, s27, s26 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1132-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s25, s26, s25 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1132-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1132-NEXT: s_add_i32 s24, s25, s24 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1132-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s23, s24, s23 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1132-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1132-NEXT: s_add_i32 s22, s23, s22 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1132-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s21, s22, s21 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1132-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1132-NEXT: s_add_i32 s20, s21, s20 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1132-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s19, s20, s19 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1132-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1132-NEXT: s_add_i32 s18, s19, s18 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1132-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s17, s18, s17 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1132-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1132-NEXT: s_add_i32 s16, s17, s16 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1132-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s15, s16, s15 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1132-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1132-NEXT: s_add_i32 s14, s15, s14 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1132-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s13, s14, s13 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1132-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1132-NEXT: s_add_i32 s12, s13, s12 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1132-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s11, s12, s11 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1132-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1132-NEXT: s_add_i32 s10, s11, s10 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1132-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s9, s10, s9 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1132-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1132-NEXT: s_add_i32 s8, s9, s8 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1132-NEXT: s_cselect_b32 s7, s7, 0 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s7, s8, s7 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1132-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1132-NEXT: s_add_i32 s5, s7, s5 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1132-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s4, s5, s4 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1132-NEXT: s_cselect_b32 s5, s6, 0 +; GFX1132-NEXT: s_add_i32 s4, s4, s5 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s3, s4, s3 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_add_i32 s2, s3, s2 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1132-NEXT: s_cselect_b32 s0, s1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s0, s2, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_u32 v0, v3 +; GFX1132-NEXT: ds_add_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB3_2: @@ -2140,269 +8079,4528 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 
0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB9_3 +; GFX8-NEXT: .LBB9_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB9_6 +; GFX8-NEXT: .LBB9_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_8 +; GFX8-NEXT: ; 
%bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB9_9 +; GFX8-NEXT: .LBB9_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB9_12 +; GFX8-NEXT: .LBB9_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB9_15 +; GFX8-NEXT: .LBB9_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB9_18 +; GFX8-NEXT: .LBB9_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 
0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB9_21 +; GFX8-NEXT: .LBB9_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB9_24 +; GFX8-NEXT: .LBB9_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB9_27 +; GFX8-NEXT: .LBB9_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB9_30 +; GFX8-NEXT: .LBB9_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB9_33 +; GFX8-NEXT: .LBB9_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB9_36 +; GFX8-NEXT: .LBB9_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB9_39 +; GFX8-NEXT: .LBB9_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB9_42 +; 
GFX8-NEXT: .LBB9_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB9_45 +; GFX8-NEXT: .LBB9_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB9_48 +; GFX8-NEXT: .LBB9_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB9_51 +; GFX8-NEXT: .LBB9_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB9_54 +; GFX8-NEXT: .LBB9_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB9_57 +; GFX8-NEXT: .LBB9_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB9_60 +; GFX8-NEXT: .LBB9_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB9_63 +; GFX8-NEXT: .LBB9_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, 
s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB9_66 +; GFX8-NEXT: .LBB9_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB9_69 +; GFX8-NEXT: .LBB9_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB9_72 +; GFX8-NEXT: .LBB9_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB9_75 +; GFX8-NEXT: .LBB9_74: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB9_78 +; GFX8-NEXT: .LBB9_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB9_81 +; GFX8-NEXT: .LBB9_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB9_84 +; GFX8-NEXT: .LBB9_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB9_87 +; GFX8-NEXT: .LBB9_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB9_90 +; GFX8-NEXT: .LBB9_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB9_93 +; GFX8-NEXT: .LBB9_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB9_96 +; GFX8-NEXT: .LBB9_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 
s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB9_99 +; GFX8-NEXT: .LBB9_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB9_102 +; GFX8-NEXT: .LBB9_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB9_105 +; GFX8-NEXT: .LBB9_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB9_108 +; GFX8-NEXT: .LBB9_107: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB9_111 +; GFX8-NEXT: .LBB9_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB9_114 +; GFX8-NEXT: .LBB9_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB9_117 +; GFX8-NEXT: .LBB9_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; 
GFX8-NEXT: s_cbranch_scc1 .LBB9_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB9_120 +; GFX8-NEXT: .LBB9_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB9_123 +; GFX8-NEXT: .LBB9_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB9_126 +; GFX8-NEXT: .LBB9_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB9_129 +; GFX8-NEXT: .LBB9_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 
s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB9_132 +; GFX8-NEXT: .LBB9_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB9_135 +; GFX8-NEXT: .LBB9_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB9_138 +; GFX8-NEXT: .LBB9_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB9_141 +; GFX8-NEXT: .LBB9_140: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB9_144 +; GFX8-NEXT: .LBB9_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB9_147 +; GFX8-NEXT: .LBB9_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB9_150 +; GFX8-NEXT: .LBB9_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 
s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB9_153 +; GFX8-NEXT: .LBB9_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB9_156 +; GFX8-NEXT: .LBB9_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB9_159 +; GFX8-NEXT: .LBB9_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB9_162 +; GFX8-NEXT: .LBB9_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; 
GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB9_165 +; GFX8-NEXT: .LBB9_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB9_168 +; GFX8-NEXT: .LBB9_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB9_171 +; GFX8-NEXT: .LBB9_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB9_174 +; GFX8-NEXT: 
.LBB9_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB9_177 +; GFX8-NEXT: .LBB9_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB9_180 +; GFX8-NEXT: .LBB9_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB9_183 +; GFX8-NEXT: .LBB9_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB9_186 +; GFX8-NEXT: .LBB9_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB9_189 +; GFX8-NEXT: .LBB9_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB9_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB9_192 +; GFX8-NEXT: .LBB9_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB9_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB9_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_cbranch_execz .LBB9_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 
m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB9_2: +; GFX8-NEXT: .LBB9_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 
-; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB9_3 +; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB9_6 +; GFX9-NEXT: .LBB9_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB9_9 +; GFX9-NEXT: .LBB9_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB9_12 +; GFX9-NEXT: .LBB9_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: 
s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB9_15 +; GFX9-NEXT: .LBB9_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB9_18 +; GFX9-NEXT: .LBB9_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB9_21 +; GFX9-NEXT: .LBB9_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB9_24 +; GFX9-NEXT: .LBB9_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB9_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB9_27 +; GFX9-NEXT: .LBB9_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB9_30 +; GFX9-NEXT: .LBB9_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB9_33 +; GFX9-NEXT: .LBB9_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_35 +; GFX9-NEXT: ; 
%bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB9_36 +; GFX9-NEXT: .LBB9_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB9_39 +; GFX9-NEXT: .LBB9_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB9_42 +; GFX9-NEXT: .LBB9_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB9_45 +; GFX9-NEXT: .LBB9_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB9_48 +; GFX9-NEXT: .LBB9_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB9_51 +; GFX9-NEXT: .LBB9_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB9_54 +; GFX9-NEXT: .LBB9_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB9_57 +; GFX9-NEXT: .LBB9_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_57: +; GFX9-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB9_60 +; GFX9-NEXT: .LBB9_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB9_63 +; GFX9-NEXT: .LBB9_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB9_66 +; GFX9-NEXT: .LBB9_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB9_69 +; GFX9-NEXT: .LBB9_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB9_72 +; GFX9-NEXT: .LBB9_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB9_75 +; GFX9-NEXT: .LBB9_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB9_78 +; GFX9-NEXT: .LBB9_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB9_81 +; GFX9-NEXT: .LBB9_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB9_84 +; GFX9-NEXT: .LBB9_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB9_87 +; GFX9-NEXT: .LBB9_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB9_90 +; GFX9-NEXT: .LBB9_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB9_93 +; GFX9-NEXT: .LBB9_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB9_96 +; GFX9-NEXT: .LBB9_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB9_99 +; GFX9-NEXT: .LBB9_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB9_102 +; GFX9-NEXT: .LBB9_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB9_105 +; GFX9-NEXT: .LBB9_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB9_108 +; GFX9-NEXT: .LBB9_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB9_111 +; GFX9-NEXT: .LBB9_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB9_114 +; GFX9-NEXT: .LBB9_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB9_117 +; GFX9-NEXT: .LBB9_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB9_120 +; GFX9-NEXT: .LBB9_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB9_123 +; GFX9-NEXT: .LBB9_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB9_126 +; GFX9-NEXT: .LBB9_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB9_129 +; GFX9-NEXT: .LBB9_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB9_132 +; GFX9-NEXT: .LBB9_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB9_135 +; GFX9-NEXT: .LBB9_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB9_138 +; GFX9-NEXT: .LBB9_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB9_141 +; GFX9-NEXT: .LBB9_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB9_144 +; GFX9-NEXT: .LBB9_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB9_147 +; GFX9-NEXT: .LBB9_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB9_150 +; GFX9-NEXT: .LBB9_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB9_153 +; GFX9-NEXT: .LBB9_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB9_156 +; GFX9-NEXT: .LBB9_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_156: +; GFX9-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB9_159 +; GFX9-NEXT: .LBB9_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB9_162 +; GFX9-NEXT: .LBB9_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB9_165 +; GFX9-NEXT: .LBB9_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_167 +; 
GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB9_168 +; GFX9-NEXT: .LBB9_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB9_171 +; GFX9-NEXT: .LBB9_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB9_174 +; GFX9-NEXT: .LBB9_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB9_177 +; GFX9-NEXT: .LBB9_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; 
GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB9_180 +; GFX9-NEXT: .LBB9_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB9_183 +; GFX9-NEXT: .LBB9_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB9_186 +; GFX9-NEXT: .LBB9_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB9_189 +; GFX9-NEXT: 
.LBB9_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB9_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB9_192 +; GFX9-NEXT: .LBB9_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB9_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB9_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_cbranch_execz .LBB9_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB9_2: +; GFX9-NEXT: .LBB9_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; 
GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, 0, 0 +; 
GFX1064-NEXT: s_branch .LBB9_3 +; GFX1064-NEXT: .LBB9_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB9_6 +; GFX1064-NEXT: .LBB9_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB9_9 +; GFX1064-NEXT: .LBB9_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB9_12 +; GFX1064-NEXT: .LBB9_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_add_i32 
s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB9_15 +; GFX1064-NEXT: .LBB9_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB9_18 +; GFX1064-NEXT: .LBB9_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB9_21 +; GFX1064-NEXT: .LBB9_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX1064-NEXT: s_cbranch_scc1 .LBB9_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB9_24 +; GFX1064-NEXT: .LBB9_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB9_27 +; GFX1064-NEXT: .LBB9_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB9_30 +; GFX1064-NEXT: .LBB9_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB9_33 +; GFX1064-NEXT: .LBB9_32: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB9_36 +; GFX1064-NEXT: .LBB9_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB9_39 +; GFX1064-NEXT: .LBB9_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB9_42 +; GFX1064-NEXT: .LBB9_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; 
GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB9_45 +; GFX1064-NEXT: .LBB9_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB9_48 +; GFX1064-NEXT: .LBB9_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB9_51 +; GFX1064-NEXT: .LBB9_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB9_54 +; GFX1064-NEXT: .LBB9_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB9_57 +; GFX1064-NEXT: .LBB9_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB9_60 +; GFX1064-NEXT: .LBB9_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: 
s_branch .LBB9_63 +; GFX1064-NEXT: .LBB9_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB9_66 +; GFX1064-NEXT: .LBB9_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB9_69 +; GFX1064-NEXT: .LBB9_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB9_72 +; GFX1064-NEXT: .LBB9_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 
+; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB9_75 +; GFX1064-NEXT: .LBB9_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB9_78 +; GFX1064-NEXT: .LBB9_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB9_81 +; GFX1064-NEXT: .LBB9_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 
exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB9_84 +; GFX1064-NEXT: .LBB9_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_add_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB9_87 +; GFX1064-NEXT: .LBB9_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB9_90 +; GFX1064-NEXT: .LBB9_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_add_i32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_92 +; GFX1064-NEXT: ; %bb.91: +; 
GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB9_93 +; GFX1064-NEXT: .LBB9_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: s_add_i32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s4, v0, 31 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1064-NEXT: s_branch .LBB9_96 +; GFX1064-NEXT: .LBB9_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB9_99 +; GFX1064-NEXT: .LBB9_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB9_102 +; GFX1064-NEXT: .LBB9_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 
+; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB9_105 +; GFX1064-NEXT: .LBB9_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB9_108 +; GFX1064-NEXT: .LBB9_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB9_111 +; GFX1064-NEXT: .LBB9_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; 
GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB9_114 +; GFX1064-NEXT: .LBB9_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB9_117 +; GFX1064-NEXT: .LBB9_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB9_120 +; GFX1064-NEXT: .LBB9_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: 
v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB9_123 +; GFX1064-NEXT: .LBB9_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB9_126 +; GFX1064-NEXT: .LBB9_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB9_129 +; GFX1064-NEXT: .LBB9_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB9_132 +; GFX1064-NEXT: .LBB9_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_132: +; GFX1064-NEXT: s_and_b64 s[2:3], 
s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB9_135 +; GFX1064-NEXT: .LBB9_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB9_138 +; GFX1064-NEXT: .LBB9_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB9_141 +; GFX1064-NEXT: .LBB9_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: 
v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB9_144 +; GFX1064-NEXT: .LBB9_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB9_147 +; GFX1064-NEXT: .LBB9_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB9_150 +; GFX1064-NEXT: .LBB9_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX1064-NEXT: s_cbranch_scc1 .LBB9_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB9_153 +; GFX1064-NEXT: .LBB9_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB9_156 +; GFX1064-NEXT: .LBB9_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB9_159 +; GFX1064-NEXT: .LBB9_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB9_162 +; GFX1064-NEXT: .LBB9_161: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB9_165 +; GFX1064-NEXT: .LBB9_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB9_168 +; GFX1064-NEXT: .LBB9_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB9_171 +; GFX1064-NEXT: .LBB9_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 
+; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB9_174 +; GFX1064-NEXT: .LBB9_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB9_177 +; GFX1064-NEXT: .LBB9_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB9_180 +; GFX1064-NEXT: .LBB9_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: 
s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB9_183 +; GFX1064-NEXT: .LBB9_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_add_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB9_186 +; GFX1064-NEXT: .LBB9_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB9_189 +; GFX1064-NEXT: .LBB9_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_add_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB9_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB9_192 +; GFX1064-NEXT: .LBB9_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB9_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB9_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB9_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_add_i32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB9_2: +; GFX1064-NEXT: .LBB9_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; 
GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB9_3 +; GFX1032-NEXT: .LBB9_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 
+; GFX1032-NEXT: s_branch .LBB9_6 +; GFX1032-NEXT: .LBB9_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB9_9 +; GFX1032-NEXT: .LBB9_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB9_12 +; GFX1032-NEXT: .LBB9_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB9_15 +; GFX1032-NEXT: .LBB9_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: 
v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB9_18 +; GFX1032-NEXT: .LBB9_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB9_21 +; GFX1032-NEXT: .LBB9_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB9_24 +; GFX1032-NEXT: .LBB9_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB9_27 +; 
GFX1032-NEXT: .LBB9_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB9_30 +; GFX1032-NEXT: .LBB9_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB9_33 +; GFX1032-NEXT: .LBB9_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB9_36 +; GFX1032-NEXT: .LBB9_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: 
v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB9_39 +; GFX1032-NEXT: .LBB9_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB9_42 +; GFX1032-NEXT: .LBB9_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB9_45 +; GFX1032-NEXT: .LBB9_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: 
s_branch .LBB9_48 +; GFX1032-NEXT: .LBB9_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB9_51 +; GFX1032-NEXT: .LBB9_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB9_54 +; GFX1032-NEXT: .LBB9_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB9_57 +; GFX1032-NEXT: .LBB9_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_add_i32 s2, 
s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB9_60 +; GFX1032-NEXT: .LBB9_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB9_63 +; GFX1032-NEXT: .LBB9_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB9_66 +; GFX1032-NEXT: .LBB9_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, 
s2, 22 +; GFX1032-NEXT: s_branch .LBB9_69 +; GFX1032-NEXT: .LBB9_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB9_72 +; GFX1032-NEXT: .LBB9_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB9_75 +; GFX1032-NEXT: .LBB9_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB9_78 +; GFX1032-NEXT: .LBB9_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; 
GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB9_81 +; GFX1032-NEXT: .LBB9_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB9_84 +; GFX1032-NEXT: .LBB9_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB9_87 +; GFX1032-NEXT: .LBB9_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_89 +; GFX1032-NEXT: ; %bb.88: +; 
GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB9_90 +; GFX1032-NEXT: .LBB9_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB9_93 +; GFX1032-NEXT: .LBB9_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB9_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB9_96 +; GFX1032-NEXT: .LBB9_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB9_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB9_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB9_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_add_i32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB9_2: +; GFX1032-NEXT: .LBB9_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB9_3 +; GFX1164-NEXT: .LBB9_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB9_6 +; GFX1164-NEXT: .LBB9_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_6: 
+; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB9_9 +; GFX1164-NEXT: .LBB9_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB9_12 +; GFX1164-NEXT: .LBB9_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB9_15 +; GFX1164-NEXT: .LBB9_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 
s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB9_18 +; GFX1164-NEXT: .LBB9_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB9_21 +; GFX1164-NEXT: .LBB9_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB9_24 +; GFX1164-NEXT: .LBB9_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_26 +; 
GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB9_27 +; GFX1164-NEXT: .LBB9_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB9_30 +; GFX1164-NEXT: .LBB9_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB9_33 +; GFX1164-NEXT: .LBB9_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB9_36 +; GFX1164-NEXT: .LBB9_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_36: +; GFX1164-NEXT: 
s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB9_39 +; GFX1164-NEXT: .LBB9_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB9_42 +; GFX1164-NEXT: .LBB9_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB9_45 +; GFX1164-NEXT: .LBB9_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: 
v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB9_48 +; GFX1164-NEXT: .LBB9_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB9_51 +; GFX1164-NEXT: .LBB9_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB9_54 +; GFX1164-NEXT: .LBB9_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB9_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB9_57 +; GFX1164-NEXT: .LBB9_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB9_60 +; GFX1164-NEXT: .LBB9_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB9_63 +; GFX1164-NEXT: .LBB9_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB9_66 +; GFX1164-NEXT: .LBB9_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB9_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB9_69 +; GFX1164-NEXT: .LBB9_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB9_72 +; GFX1164-NEXT: .LBB9_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB9_75 +; GFX1164-NEXT: .LBB9_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; 
GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB9_78 +; GFX1164-NEXT: .LBB9_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB9_81 +; GFX1164-NEXT: .LBB9_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB9_84 +; GFX1164-NEXT: .LBB9_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_add_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 
+; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB9_87 +; GFX1164-NEXT: .LBB9_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB9_90 +; GFX1164-NEXT: .LBB9_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_add_i32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB9_93 +; GFX1164-NEXT: .LBB9_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: s_add_i32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s4, 
v0, 31 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 31 +; GFX1164-NEXT: s_branch .LBB9_96 +; GFX1164-NEXT: .LBB9_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s4, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB9_99 +; GFX1164-NEXT: .LBB9_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB9_102 +; GFX1164-NEXT: .LBB9_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_104 +; 
GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB9_105 +; GFX1164-NEXT: .LBB9_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB9_108 +; GFX1164-NEXT: .LBB9_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB9_111 +; GFX1164-NEXT: .LBB9_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB9_114 +; GFX1164-NEXT: .LBB9_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_114: +; 
GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB9_117 +; GFX1164-NEXT: .LBB9_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB9_120 +; GFX1164-NEXT: .LBB9_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB9_123 +; GFX1164-NEXT: .LBB9_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 
+; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB9_126 +; GFX1164-NEXT: .LBB9_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB9_129 +; GFX1164-NEXT: .LBB9_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB9_132 +; GFX1164-NEXT: .LBB9_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB9_135 +; GFX1164-NEXT: .LBB9_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB9_138 +; GFX1164-NEXT: .LBB9_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB9_141 +; GFX1164-NEXT: .LBB9_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB9_144 +; GFX1164-NEXT: .LBB9_143: +; 
GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB9_147 +; GFX1164-NEXT: .LBB9_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB9_150 +; GFX1164-NEXT: .LBB9_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB9_153 +; GFX1164-NEXT: .LBB9_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; 
GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB9_156 +; GFX1164-NEXT: .LBB9_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB9_159 +; GFX1164-NEXT: .LBB9_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB9_162 +; GFX1164-NEXT: .LBB9_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: 
s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB9_165 +; GFX1164-NEXT: .LBB9_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB9_168 +; GFX1164-NEXT: .LBB9_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB9_171 +; GFX1164-NEXT: .LBB9_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_173 +; GFX1164-NEXT: ; %bb.172: +; 
GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB9_174 +; GFX1164-NEXT: .LBB9_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB9_177 +; GFX1164-NEXT: .LBB9_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB9_180 +; GFX1164-NEXT: .LBB9_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB9_183 +; GFX1164-NEXT: .LBB9_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB9_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_add_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB9_186 +; GFX1164-NEXT: .LBB9_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB9_189 +; GFX1164-NEXT: .LBB9_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_add_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB9_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB9_192 +; GFX1164-NEXT: .LBB9_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB9_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: 
; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB9_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB9_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_add_i32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB9_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB9_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2410,53 +12608,510 @@ ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_2 +; GFX1132-NEXT: ; 
%bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB9_3 +; GFX1132-NEXT: .LBB9_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 
2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB9_6 +; GFX1132-NEXT: .LBB9_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB9_9 +; GFX1132-NEXT: .LBB9_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB9_12 +; GFX1132-NEXT: .LBB9_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB9_15 +; 
GFX1132-NEXT: .LBB9_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB9_18 +; GFX1132-NEXT: .LBB9_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB9_21 +; GFX1132-NEXT: .LBB9_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB9_24 +; GFX1132-NEXT: .LBB9_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 
8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB9_27 +; GFX1132-NEXT: .LBB9_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB9_30 +; GFX1132-NEXT: .LBB9_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB9_33 +; GFX1132-NEXT: .LBB9_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB9_36 +; GFX1132-NEXT: 
.LBB9_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB9_39 +; GFX1132-NEXT: .LBB9_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB9_42 +; GFX1132-NEXT: .LBB9_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB9_45 +; GFX1132-NEXT: .LBB9_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 
s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB9_48 +; GFX1132-NEXT: .LBB9_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB9_51 +; GFX1132-NEXT: .LBB9_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB9_54 +; GFX1132-NEXT: .LBB9_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB9_57 
+; GFX1132-NEXT: .LBB9_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB9_60 +; GFX1132-NEXT: .LBB9_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB9_63 +; GFX1132-NEXT: .LBB9_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB9_66 +; GFX1132-NEXT: .LBB9_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; 
GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB9_69 +; GFX1132-NEXT: .LBB9_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB9_72 +; GFX1132-NEXT: .LBB9_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB9_75 +; GFX1132-NEXT: .LBB9_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 
+; GFX1132-NEXT: s_branch .LBB9_78 +; GFX1132-NEXT: .LBB9_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB9_81 +; GFX1132-NEXT: .LBB9_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB9_84 +; GFX1132-NEXT: .LBB9_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB9_87 +; GFX1132-NEXT: .LBB9_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; 
GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB9_90 +; GFX1132-NEXT: .LBB9_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB9_93 +; GFX1132-NEXT: .LBB9_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB9_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB9_96 +; GFX1132-NEXT: .LBB9_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB9_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB9_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: 
s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB9_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_add_i32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_sub_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB9_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB9_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2480,106 +13135,821 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s2, v1, 63 -; GFX8-NEXT: s_mov_b64 exec, s[0:1] -; GFX8-NEXT: s_mov_b32 s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_mov_b64 s[0:1], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX8-NEXT: v_readfirstlane_b32 s67, v1 +; GFX8-NEXT: v_readlane_b32 s66, v0, 0 +; GFX8-NEXT: v_readlane_b32 s65, v0, 1 +; GFX8-NEXT: v_readlane_b32 s64, v0, 2 +; GFX8-NEXT: v_readlane_b32 s63, v0, 3 +; GFX8-NEXT: v_readlane_b32 s62, v0, 4 +; GFX8-NEXT: v_readlane_b32 s61, v0, 5 +; GFX8-NEXT: v_readlane_b32 s60, v0, 6 +; GFX8-NEXT: v_readlane_b32 s59, v0, 7 +; GFX8-NEXT: v_readlane_b32 s58, v0, 8 +; GFX8-NEXT: v_readlane_b32 s57, v0, 9 +; GFX8-NEXT: v_readlane_b32 s56, v0, 10 +; GFX8-NEXT: v_readlane_b32 s55, v0, 11 +; GFX8-NEXT: v_readlane_b32 s54, v0, 12 +; GFX8-NEXT: v_readlane_b32 s53, v0, 13 +; GFX8-NEXT: v_readlane_b32 s52, v0, 14 +; GFX8-NEXT: v_readlane_b32 s51, v0, 15 +; GFX8-NEXT: v_readlane_b32 s50, v0, 16 +; GFX8-NEXT: v_readlane_b32 s49, v0, 17 +; GFX8-NEXT: v_readlane_b32 s48, v0, 18 +; GFX8-NEXT: v_readlane_b32 s47, v0, 19 +; GFX8-NEXT: v_readlane_b32 s46, v0, 20 +; GFX8-NEXT: v_readlane_b32 s45, v0, 21 +; GFX8-NEXT: v_readlane_b32 s44, v0, 22 +; GFX8-NEXT: v_readlane_b32 s43, v0, 23 +; GFX8-NEXT: v_readlane_b32 s42, v0, 24 +; GFX8-NEXT: v_readlane_b32 s41, v0, 25 +; GFX8-NEXT: v_readlane_b32 s40, v0, 26 +; GFX8-NEXT: v_readlane_b32 s39, v0, 27 +; GFX8-NEXT: v_readlane_b32 s38, v0, 28 +; GFX8-NEXT: v_readlane_b32 s37, v0, 29 +; GFX8-NEXT: v_readlane_b32 s36, v0, 30 
+; GFX8-NEXT: v_readlane_b32 s35, v0, 31 +; GFX8-NEXT: v_readlane_b32 s34, v0, 32 +; GFX8-NEXT: v_readlane_b32 s33, v0, 33 +; GFX8-NEXT: v_readlane_b32 s31, v0, 34 +; GFX8-NEXT: v_readlane_b32 s30, v0, 35 +; GFX8-NEXT: v_readlane_b32 s29, v0, 36 +; GFX8-NEXT: v_readlane_b32 s28, v0, 37 +; GFX8-NEXT: v_readlane_b32 s27, v0, 38 +; GFX8-NEXT: v_readlane_b32 s26, v0, 39 +; GFX8-NEXT: v_readlane_b32 s25, v0, 40 +; GFX8-NEXT: v_readlane_b32 s24, v0, 41 +; GFX8-NEXT: v_readlane_b32 s23, v0, 42 +; GFX8-NEXT: v_readlane_b32 s22, v0, 43 +; GFX8-NEXT: v_readlane_b32 s21, v0, 44 +; GFX8-NEXT: v_readlane_b32 s20, v0, 45 +; GFX8-NEXT: v_readlane_b32 s19, v0, 46 +; GFX8-NEXT: v_readlane_b32 s18, v0, 47 +; GFX8-NEXT: v_readlane_b32 s17, v0, 48 +; GFX8-NEXT: v_readlane_b32 s16, v0, 49 +; GFX8-NEXT: v_readlane_b32 s15, v0, 50 +; GFX8-NEXT: v_readlane_b32 s14, v0, 51 +; GFX8-NEXT: v_readlane_b32 s13, v0, 52 +; GFX8-NEXT: v_readlane_b32 s12, v0, 53 +; GFX8-NEXT: v_readlane_b32 s11, v0, 54 +; GFX8-NEXT: v_readlane_b32 s10, v0, 55 +; GFX8-NEXT: v_readlane_b32 s9, v0, 56 +; GFX8-NEXT: v_readlane_b32 s8, v0, 57 +; GFX8-NEXT: v_readlane_b32 s7, v0, 58 +; GFX8-NEXT: v_readlane_b32 s6, v0, 59 +; GFX8-NEXT: v_readlane_b32 s5, v0, 60 +; GFX8-NEXT: v_readlane_b32 s4, v0, 61 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: v_readlane_b32 s2, v0, 63 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s67, v1 +; GFX8-NEXT: s_and_saveexec_b64 s[68:69], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_bitcmp1_b32 s0, 0 +; GFX8-NEXT: s_cselect_b32 s66, s66, 0 +; GFX8-NEXT: s_bitcmp1_b32 s0, 1 +; GFX8-NEXT: s_cselect_b32 s65, s65, 0 +; GFX8-NEXT: s_add_i32 s65, s66, s65 +; GFX8-NEXT: s_bitcmp1_b32 s0, 2 +; GFX8-NEXT: s_cselect_b32 s64, s64, 0 +; GFX8-NEXT: s_add_i32 s64, s65, s64 +; GFX8-NEXT: s_bitcmp1_b32 s0, 3 +; GFX8-NEXT: s_cselect_b32 s63, s63, 0 +; GFX8-NEXT: s_add_i32 s63, s64, s63 +; GFX8-NEXT: s_bitcmp1_b32 s0, 4 +; GFX8-NEXT: 
s_cselect_b32 s62, s62, 0 +; GFX8-NEXT: s_add_i32 s62, s63, s62 +; GFX8-NEXT: s_bitcmp1_b32 s0, 5 +; GFX8-NEXT: s_cselect_b32 s61, s61, 0 +; GFX8-NEXT: s_add_i32 s61, s62, s61 +; GFX8-NEXT: s_bitcmp1_b32 s0, 6 +; GFX8-NEXT: s_cselect_b32 s60, s60, 0 +; GFX8-NEXT: s_add_i32 s60, s61, s60 +; GFX8-NEXT: s_bitcmp1_b32 s0, 7 +; GFX8-NEXT: s_cselect_b32 s59, s59, 0 +; GFX8-NEXT: s_add_i32 s59, s60, s59 +; GFX8-NEXT: s_bitcmp1_b32 s0, 8 +; GFX8-NEXT: s_cselect_b32 s58, s58, 0 +; GFX8-NEXT: s_add_i32 s58, s59, s58 +; GFX8-NEXT: s_bitcmp1_b32 s0, 9 +; GFX8-NEXT: s_cselect_b32 s57, s57, 0 +; GFX8-NEXT: s_add_i32 s57, s58, s57 +; GFX8-NEXT: s_bitcmp1_b32 s0, 10 +; GFX8-NEXT: s_cselect_b32 s56, s56, 0 +; GFX8-NEXT: s_add_i32 s56, s57, s56 +; GFX8-NEXT: s_bitcmp1_b32 s0, 11 +; GFX8-NEXT: s_cselect_b32 s55, s55, 0 +; GFX8-NEXT: s_add_i32 s55, s56, s55 +; GFX8-NEXT: s_bitcmp1_b32 s0, 12 +; GFX8-NEXT: s_cselect_b32 s54, s54, 0 +; GFX8-NEXT: s_add_i32 s54, s55, s54 +; GFX8-NEXT: s_bitcmp1_b32 s0, 13 +; GFX8-NEXT: s_cselect_b32 s53, s53, 0 +; GFX8-NEXT: s_add_i32 s53, s54, s53 +; GFX8-NEXT: s_bitcmp1_b32 s0, 14 +; GFX8-NEXT: s_cselect_b32 s52, s52, 0 +; GFX8-NEXT: s_add_i32 s52, s53, s52 +; GFX8-NEXT: s_bitcmp1_b32 s0, 15 +; GFX8-NEXT: s_cselect_b32 s51, s51, 0 +; GFX8-NEXT: s_add_i32 s51, s52, s51 +; GFX8-NEXT: s_bitcmp1_b32 s0, 16 +; GFX8-NEXT: s_cselect_b32 s50, s50, 0 +; GFX8-NEXT: s_add_i32 s50, s51, s50 +; GFX8-NEXT: s_bitcmp1_b32 s0, 17 +; GFX8-NEXT: s_cselect_b32 s49, s49, 0 +; GFX8-NEXT: s_add_i32 s49, s50, s49 +; GFX8-NEXT: s_bitcmp1_b32 s0, 18 +; GFX8-NEXT: s_cselect_b32 s48, s48, 0 +; GFX8-NEXT: s_add_i32 s48, s49, s48 +; GFX8-NEXT: s_bitcmp1_b32 s0, 19 +; GFX8-NEXT: s_cselect_b32 s47, s47, 0 +; GFX8-NEXT: s_add_i32 s47, s48, s47 +; GFX8-NEXT: s_bitcmp1_b32 s0, 20 +; GFX8-NEXT: s_cselect_b32 s46, s46, 0 +; GFX8-NEXT: s_add_i32 s46, s47, s46 +; GFX8-NEXT: s_bitcmp1_b32 s0, 21 +; GFX8-NEXT: s_cselect_b32 s45, s45, 0 +; GFX8-NEXT: s_add_i32 s45, s46, s45 +; GFX8-NEXT: 
s_bitcmp1_b32 s0, 22 +; GFX8-NEXT: s_cselect_b32 s44, s44, 0 +; GFX8-NEXT: s_add_i32 s44, s45, s44 +; GFX8-NEXT: s_bitcmp1_b32 s0, 23 +; GFX8-NEXT: s_cselect_b32 s43, s43, 0 +; GFX8-NEXT: s_add_i32 s43, s44, s43 +; GFX8-NEXT: s_bitcmp1_b32 s0, 24 +; GFX8-NEXT: s_cselect_b32 s42, s42, 0 +; GFX8-NEXT: s_add_i32 s42, s43, s42 +; GFX8-NEXT: s_bitcmp1_b32 s0, 25 +; GFX8-NEXT: s_cselect_b32 s41, s41, 0 +; GFX8-NEXT: s_add_i32 s41, s42, s41 +; GFX8-NEXT: s_bitcmp1_b32 s0, 26 +; GFX8-NEXT: s_cselect_b32 s40, s40, 0 +; GFX8-NEXT: s_add_i32 s40, s41, s40 +; GFX8-NEXT: s_bitcmp1_b32 s0, 27 +; GFX8-NEXT: s_cselect_b32 s39, s39, 0 +; GFX8-NEXT: s_add_i32 s39, s40, s39 +; GFX8-NEXT: s_bitcmp1_b32 s0, 28 +; GFX8-NEXT: s_cselect_b32 s38, s38, 0 +; GFX8-NEXT: s_add_i32 s38, s39, s38 +; GFX8-NEXT: s_bitcmp1_b32 s0, 29 +; GFX8-NEXT: s_cselect_b32 s37, s37, 0 +; GFX8-NEXT: s_add_i32 s37, s38, s37 +; GFX8-NEXT: s_bitcmp1_b32 s0, 30 +; GFX8-NEXT: s_cselect_b32 s36, s36, 0 +; GFX8-NEXT: s_add_i32 s36, s37, s36 +; GFX8-NEXT: s_bitcmp1_b32 s0, 31 +; GFX8-NEXT: s_cselect_b32 s0, s35, 0 +; GFX8-NEXT: s_add_i32 s0, s36, s0 +; GFX8-NEXT: s_bitcmp1_b32 s1, 0 +; GFX8-NEXT: s_cselect_b32 s34, s34, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s34 +; GFX8-NEXT: s_bitcmp1_b32 s1, 1 +; GFX8-NEXT: s_cselect_b32 s33, s33, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s33 +; GFX8-NEXT: s_bitcmp1_b32 s1, 2 +; GFX8-NEXT: s_cselect_b32 s31, s31, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s31 +; GFX8-NEXT: s_bitcmp1_b32 s1, 3 +; GFX8-NEXT: s_cselect_b32 s30, s30, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s30 +; GFX8-NEXT: s_bitcmp1_b32 s1, 4 +; GFX8-NEXT: s_cselect_b32 s29, s29, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s29 +; GFX8-NEXT: s_bitcmp1_b32 s1, 5 +; GFX8-NEXT: s_cselect_b32 s28, s28, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s28 +; GFX8-NEXT: s_bitcmp1_b32 s1, 6 +; GFX8-NEXT: s_cselect_b32 s27, s27, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s27 +; GFX8-NEXT: s_bitcmp1_b32 s1, 7 +; GFX8-NEXT: s_cselect_b32 s26, s26, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s26 +; 
GFX8-NEXT: s_bitcmp1_b32 s1, 8 +; GFX8-NEXT: s_cselect_b32 s25, s25, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s25 +; GFX8-NEXT: s_bitcmp1_b32 s1, 9 +; GFX8-NEXT: s_cselect_b32 s24, s24, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s24 +; GFX8-NEXT: s_bitcmp1_b32 s1, 10 +; GFX8-NEXT: s_cselect_b32 s23, s23, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s23 +; GFX8-NEXT: s_bitcmp1_b32 s1, 11 +; GFX8-NEXT: s_cselect_b32 s22, s22, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s22 +; GFX8-NEXT: s_bitcmp1_b32 s1, 12 +; GFX8-NEXT: s_cselect_b32 s21, s21, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s21 +; GFX8-NEXT: s_bitcmp1_b32 s1, 13 +; GFX8-NEXT: s_cselect_b32 s20, s20, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s20 +; GFX8-NEXT: s_bitcmp1_b32 s1, 14 +; GFX8-NEXT: s_cselect_b32 s19, s19, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s19 +; GFX8-NEXT: s_bitcmp1_b32 s1, 15 +; GFX8-NEXT: s_cselect_b32 s18, s18, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s18 +; GFX8-NEXT: s_bitcmp1_b32 s1, 16 +; GFX8-NEXT: s_cselect_b32 s17, s17, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s17 +; GFX8-NEXT: s_bitcmp1_b32 s1, 17 +; GFX8-NEXT: s_cselect_b32 s16, s16, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s16 +; GFX8-NEXT: s_bitcmp1_b32 s1, 18 +; GFX8-NEXT: s_cselect_b32 s15, s15, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s15 +; GFX8-NEXT: s_bitcmp1_b32 s1, 19 +; GFX8-NEXT: s_cselect_b32 s14, s14, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s14 +; GFX8-NEXT: s_bitcmp1_b32 s1, 20 +; GFX8-NEXT: s_cselect_b32 s13, s13, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s13 +; GFX8-NEXT: s_bitcmp1_b32 s1, 21 +; GFX8-NEXT: s_cselect_b32 s12, s12, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s12 +; GFX8-NEXT: s_bitcmp1_b32 s1, 22 +; GFX8-NEXT: s_cselect_b32 s11, s11, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s11 +; GFX8-NEXT: s_bitcmp1_b32 s1, 23 +; GFX8-NEXT: s_cselect_b32 s10, s10, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s10 +; GFX8-NEXT: s_bitcmp1_b32 s1, 24 +; GFX8-NEXT: s_cselect_b32 s9, s9, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s9 +; GFX8-NEXT: s_bitcmp1_b32 s1, 25 +; GFX8-NEXT: s_cselect_b32 s8, s8, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s8 +; 
GFX8-NEXT: s_bitcmp1_b32 s1, 26 +; GFX8-NEXT: s_cselect_b32 s7, s7, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s7 +; GFX8-NEXT: s_bitcmp1_b32 s1, 27 +; GFX8-NEXT: s_cselect_b32 s6, s6, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s6 +; GFX8-NEXT: s_bitcmp1_b32 s1, 28 +; GFX8-NEXT: s_cselect_b32 s5, s5, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s5 +; GFX8-NEXT: s_bitcmp1_b32 s1, 29 +; GFX8-NEXT: s_cselect_b32 s4, s4, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s4 +; GFX8-NEXT: s_bitcmp1_b32 s1, 30 +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s3 +; GFX8-NEXT: s_bitcmp1_b32 s1, 31 +; GFX8-NEXT: s_cselect_b32 s1, s2, 0 +; GFX8-NEXT: s_add_i32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_u32 v2, v0 +; GFX8-NEXT: ds_sub_u32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB10_2: ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s2, v1, 63 -; GFX9-NEXT: s_mov_b64 exec, s[0:1] 
-; GFX9-NEXT: s_mov_b32 s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s67, v1 +; GFX9-NEXT: v_readlane_b32 s66, v0, 0 +; GFX9-NEXT: v_readlane_b32 s65, v0, 1 +; GFX9-NEXT: v_readlane_b32 s64, v0, 2 +; GFX9-NEXT: v_readlane_b32 s63, v0, 3 +; GFX9-NEXT: v_readlane_b32 s62, v0, 4 +; GFX9-NEXT: v_readlane_b32 s61, v0, 5 +; GFX9-NEXT: v_readlane_b32 s60, v0, 6 +; GFX9-NEXT: v_readlane_b32 s59, v0, 7 +; GFX9-NEXT: v_readlane_b32 s58, v0, 8 +; GFX9-NEXT: v_readlane_b32 s57, v0, 9 +; GFX9-NEXT: v_readlane_b32 s56, v0, 10 +; GFX9-NEXT: v_readlane_b32 s55, v0, 11 +; GFX9-NEXT: v_readlane_b32 s54, v0, 12 +; GFX9-NEXT: v_readlane_b32 s53, v0, 13 +; GFX9-NEXT: v_readlane_b32 s52, v0, 14 +; GFX9-NEXT: v_readlane_b32 s51, v0, 15 +; GFX9-NEXT: v_readlane_b32 s50, v0, 16 +; GFX9-NEXT: v_readlane_b32 s49, v0, 17 +; GFX9-NEXT: v_readlane_b32 s48, v0, 18 +; GFX9-NEXT: v_readlane_b32 s47, v0, 19 +; GFX9-NEXT: v_readlane_b32 s46, v0, 20 +; GFX9-NEXT: v_readlane_b32 s45, v0, 21 +; GFX9-NEXT: v_readlane_b32 s44, v0, 22 +; GFX9-NEXT: v_readlane_b32 s43, v0, 23 +; GFX9-NEXT: v_readlane_b32 s42, v0, 24 +; GFX9-NEXT: v_readlane_b32 s41, v0, 25 +; GFX9-NEXT: v_readlane_b32 s40, v0, 26 +; GFX9-NEXT: v_readlane_b32 s39, v0, 27 +; GFX9-NEXT: v_readlane_b32 s38, v0, 28 +; GFX9-NEXT: v_readlane_b32 s37, v0, 29 +; GFX9-NEXT: v_readlane_b32 s36, v0, 30 +; GFX9-NEXT: v_readlane_b32 s35, v0, 31 +; GFX9-NEXT: v_readlane_b32 s34, v0, 32 +; GFX9-NEXT: v_readlane_b32 s33, v0, 33 +; GFX9-NEXT: v_readlane_b32 s31, v0, 34 +; GFX9-NEXT: v_readlane_b32 s30, v0, 35 +; GFX9-NEXT: v_readlane_b32 s29, v0, 36 +; GFX9-NEXT: v_readlane_b32 s28, v0, 37 +; GFX9-NEXT: v_readlane_b32 s27, v0, 38 +; GFX9-NEXT: v_readlane_b32 s26, v0, 39 +; GFX9-NEXT: v_readlane_b32 s25, v0, 40 +; GFX9-NEXT: v_readlane_b32 s24, v0, 41 +; 
GFX9-NEXT: v_readlane_b32 s23, v0, 42 +; GFX9-NEXT: v_readlane_b32 s22, v0, 43 +; GFX9-NEXT: v_readlane_b32 s21, v0, 44 +; GFX9-NEXT: v_readlane_b32 s20, v0, 45 +; GFX9-NEXT: v_readlane_b32 s19, v0, 46 +; GFX9-NEXT: v_readlane_b32 s18, v0, 47 +; GFX9-NEXT: v_readlane_b32 s17, v0, 48 +; GFX9-NEXT: v_readlane_b32 s16, v0, 49 +; GFX9-NEXT: v_readlane_b32 s15, v0, 50 +; GFX9-NEXT: v_readlane_b32 s14, v0, 51 +; GFX9-NEXT: v_readlane_b32 s13, v0, 52 +; GFX9-NEXT: v_readlane_b32 s12, v0, 53 +; GFX9-NEXT: v_readlane_b32 s11, v0, 54 +; GFX9-NEXT: v_readlane_b32 s10, v0, 55 +; GFX9-NEXT: v_readlane_b32 s9, v0, 56 +; GFX9-NEXT: v_readlane_b32 s8, v0, 57 +; GFX9-NEXT: v_readlane_b32 s7, v0, 58 +; GFX9-NEXT: v_readlane_b32 s6, v0, 59 +; GFX9-NEXT: v_readlane_b32 s5, v0, 60 +; GFX9-NEXT: v_readlane_b32 s4, v0, 61 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: v_readlane_b32 s2, v0, 63 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s67, v1 +; GFX9-NEXT: s_and_saveexec_b64 s[68:69], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_bitcmp1_b32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s66, s66, 0 +; GFX9-NEXT: s_bitcmp1_b32 s0, 1 +; GFX9-NEXT: s_cselect_b32 s65, s65, 0 +; GFX9-NEXT: s_add_i32 s65, s66, s65 +; GFX9-NEXT: s_bitcmp1_b32 s0, 2 +; GFX9-NEXT: s_cselect_b32 s64, s64, 0 +; GFX9-NEXT: s_add_i32 s64, s65, s64 +; GFX9-NEXT: s_bitcmp1_b32 s0, 3 +; GFX9-NEXT: s_cselect_b32 s63, s63, 0 +; GFX9-NEXT: s_add_i32 s63, s64, s63 +; GFX9-NEXT: s_bitcmp1_b32 s0, 4 +; GFX9-NEXT: s_cselect_b32 s62, s62, 0 +; GFX9-NEXT: s_add_i32 s62, s63, s62 +; GFX9-NEXT: s_bitcmp1_b32 s0, 5 +; GFX9-NEXT: s_cselect_b32 s61, s61, 0 +; GFX9-NEXT: s_add_i32 s61, s62, s61 +; GFX9-NEXT: s_bitcmp1_b32 s0, 6 +; GFX9-NEXT: s_cselect_b32 s60, s60, 0 +; GFX9-NEXT: s_add_i32 s60, s61, s60 +; GFX9-NEXT: s_bitcmp1_b32 s0, 7 +; GFX9-NEXT: s_cselect_b32 s59, s59, 0 +; GFX9-NEXT: s_add_i32 s59, s60, s59 +; GFX9-NEXT: s_bitcmp1_b32 s0, 8 +; GFX9-NEXT: s_cselect_b32 
s58, s58, 0 +; GFX9-NEXT: s_add_i32 s58, s59, s58 +; GFX9-NEXT: s_bitcmp1_b32 s0, 9 +; GFX9-NEXT: s_cselect_b32 s57, s57, 0 +; GFX9-NEXT: s_add_i32 s57, s58, s57 +; GFX9-NEXT: s_bitcmp1_b32 s0, 10 +; GFX9-NEXT: s_cselect_b32 s56, s56, 0 +; GFX9-NEXT: s_add_i32 s56, s57, s56 +; GFX9-NEXT: s_bitcmp1_b32 s0, 11 +; GFX9-NEXT: s_cselect_b32 s55, s55, 0 +; GFX9-NEXT: s_add_i32 s55, s56, s55 +; GFX9-NEXT: s_bitcmp1_b32 s0, 12 +; GFX9-NEXT: s_cselect_b32 s54, s54, 0 +; GFX9-NEXT: s_add_i32 s54, s55, s54 +; GFX9-NEXT: s_bitcmp1_b32 s0, 13 +; GFX9-NEXT: s_cselect_b32 s53, s53, 0 +; GFX9-NEXT: s_add_i32 s53, s54, s53 +; GFX9-NEXT: s_bitcmp1_b32 s0, 14 +; GFX9-NEXT: s_cselect_b32 s52, s52, 0 +; GFX9-NEXT: s_add_i32 s52, s53, s52 +; GFX9-NEXT: s_bitcmp1_b32 s0, 15 +; GFX9-NEXT: s_cselect_b32 s51, s51, 0 +; GFX9-NEXT: s_add_i32 s51, s52, s51 +; GFX9-NEXT: s_bitcmp1_b32 s0, 16 +; GFX9-NEXT: s_cselect_b32 s50, s50, 0 +; GFX9-NEXT: s_add_i32 s50, s51, s50 +; GFX9-NEXT: s_bitcmp1_b32 s0, 17 +; GFX9-NEXT: s_cselect_b32 s49, s49, 0 +; GFX9-NEXT: s_add_i32 s49, s50, s49 +; GFX9-NEXT: s_bitcmp1_b32 s0, 18 +; GFX9-NEXT: s_cselect_b32 s48, s48, 0 +; GFX9-NEXT: s_add_i32 s48, s49, s48 +; GFX9-NEXT: s_bitcmp1_b32 s0, 19 +; GFX9-NEXT: s_cselect_b32 s47, s47, 0 +; GFX9-NEXT: s_add_i32 s47, s48, s47 +; GFX9-NEXT: s_bitcmp1_b32 s0, 20 +; GFX9-NEXT: s_cselect_b32 s46, s46, 0 +; GFX9-NEXT: s_add_i32 s46, s47, s46 +; GFX9-NEXT: s_bitcmp1_b32 s0, 21 +; GFX9-NEXT: s_cselect_b32 s45, s45, 0 +; GFX9-NEXT: s_add_i32 s45, s46, s45 +; GFX9-NEXT: s_bitcmp1_b32 s0, 22 +; GFX9-NEXT: s_cselect_b32 s44, s44, 0 +; GFX9-NEXT: s_add_i32 s44, s45, s44 +; GFX9-NEXT: s_bitcmp1_b32 s0, 23 +; GFX9-NEXT: s_cselect_b32 s43, s43, 0 +; GFX9-NEXT: s_add_i32 s43, s44, s43 +; GFX9-NEXT: s_bitcmp1_b32 s0, 24 +; GFX9-NEXT: s_cselect_b32 s42, s42, 0 +; GFX9-NEXT: s_add_i32 s42, s43, s42 +; GFX9-NEXT: s_bitcmp1_b32 s0, 25 +; GFX9-NEXT: s_cselect_b32 s41, s41, 0 +; GFX9-NEXT: s_add_i32 s41, s42, s41 +; GFX9-NEXT: s_bitcmp1_b32 
s0, 26 +; GFX9-NEXT: s_cselect_b32 s40, s40, 0 +; GFX9-NEXT: s_add_i32 s40, s41, s40 +; GFX9-NEXT: s_bitcmp1_b32 s0, 27 +; GFX9-NEXT: s_cselect_b32 s39, s39, 0 +; GFX9-NEXT: s_add_i32 s39, s40, s39 +; GFX9-NEXT: s_bitcmp1_b32 s0, 28 +; GFX9-NEXT: s_cselect_b32 s38, s38, 0 +; GFX9-NEXT: s_add_i32 s38, s39, s38 +; GFX9-NEXT: s_bitcmp1_b32 s0, 29 +; GFX9-NEXT: s_cselect_b32 s37, s37, 0 +; GFX9-NEXT: s_add_i32 s37, s38, s37 +; GFX9-NEXT: s_bitcmp1_b32 s0, 30 +; GFX9-NEXT: s_cselect_b32 s36, s36, 0 +; GFX9-NEXT: s_add_i32 s36, s37, s36 +; GFX9-NEXT: s_bitcmp1_b32 s0, 31 +; GFX9-NEXT: s_cselect_b32 s0, s35, 0 +; GFX9-NEXT: s_add_i32 s0, s36, s0 +; GFX9-NEXT: s_bitcmp1_b32 s1, 0 +; GFX9-NEXT: s_cselect_b32 s34, s34, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s34 +; GFX9-NEXT: s_bitcmp1_b32 s1, 1 +; GFX9-NEXT: s_cselect_b32 s33, s33, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s33 +; GFX9-NEXT: s_bitcmp1_b32 s1, 2 +; GFX9-NEXT: s_cselect_b32 s31, s31, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s31 +; GFX9-NEXT: s_bitcmp1_b32 s1, 3 +; GFX9-NEXT: s_cselect_b32 s30, s30, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s30 +; GFX9-NEXT: s_bitcmp1_b32 s1, 4 +; GFX9-NEXT: s_cselect_b32 s29, s29, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s29 +; GFX9-NEXT: s_bitcmp1_b32 s1, 5 +; GFX9-NEXT: s_cselect_b32 s28, s28, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s28 +; GFX9-NEXT: s_bitcmp1_b32 s1, 6 +; GFX9-NEXT: s_cselect_b32 s27, s27, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s27 +; GFX9-NEXT: s_bitcmp1_b32 s1, 7 +; GFX9-NEXT: s_cselect_b32 s26, s26, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s26 +; GFX9-NEXT: s_bitcmp1_b32 s1, 8 +; GFX9-NEXT: s_cselect_b32 s25, s25, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s25 +; GFX9-NEXT: s_bitcmp1_b32 s1, 9 +; GFX9-NEXT: s_cselect_b32 s24, s24, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s24 +; GFX9-NEXT: s_bitcmp1_b32 s1, 10 +; GFX9-NEXT: s_cselect_b32 s23, s23, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s23 +; GFX9-NEXT: s_bitcmp1_b32 s1, 11 +; GFX9-NEXT: s_cselect_b32 s22, s22, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s22 +; GFX9-NEXT: s_bitcmp1_b32 
s1, 12 +; GFX9-NEXT: s_cselect_b32 s21, s21, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s21 +; GFX9-NEXT: s_bitcmp1_b32 s1, 13 +; GFX9-NEXT: s_cselect_b32 s20, s20, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s20 +; GFX9-NEXT: s_bitcmp1_b32 s1, 14 +; GFX9-NEXT: s_cselect_b32 s19, s19, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s19 +; GFX9-NEXT: s_bitcmp1_b32 s1, 15 +; GFX9-NEXT: s_cselect_b32 s18, s18, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s18 +; GFX9-NEXT: s_bitcmp1_b32 s1, 16 +; GFX9-NEXT: s_cselect_b32 s17, s17, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s17 +; GFX9-NEXT: s_bitcmp1_b32 s1, 17 +; GFX9-NEXT: s_cselect_b32 s16, s16, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s16 +; GFX9-NEXT: s_bitcmp1_b32 s1, 18 +; GFX9-NEXT: s_cselect_b32 s15, s15, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s15 +; GFX9-NEXT: s_bitcmp1_b32 s1, 19 +; GFX9-NEXT: s_cselect_b32 s14, s14, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s14 +; GFX9-NEXT: s_bitcmp1_b32 s1, 20 +; GFX9-NEXT: s_cselect_b32 s13, s13, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s13 +; GFX9-NEXT: s_bitcmp1_b32 s1, 21 +; GFX9-NEXT: s_cselect_b32 s12, s12, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s12 +; GFX9-NEXT: s_bitcmp1_b32 s1, 22 +; GFX9-NEXT: s_cselect_b32 s11, s11, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s11 +; GFX9-NEXT: s_bitcmp1_b32 s1, 23 +; GFX9-NEXT: s_cselect_b32 s10, s10, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s10 +; GFX9-NEXT: s_bitcmp1_b32 s1, 24 +; GFX9-NEXT: s_cselect_b32 s9, s9, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s9 +; GFX9-NEXT: s_bitcmp1_b32 s1, 25 +; GFX9-NEXT: s_cselect_b32 s8, s8, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s8 +; GFX9-NEXT: s_bitcmp1_b32 s1, 26 +; GFX9-NEXT: s_cselect_b32 s7, s7, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s7 +; GFX9-NEXT: s_bitcmp1_b32 s1, 27 +; GFX9-NEXT: s_cselect_b32 s6, s6, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s6 +; GFX9-NEXT: s_bitcmp1_b32 s1, 28 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s5 +; GFX9-NEXT: s_bitcmp1_b32 s1, 29 +; GFX9-NEXT: s_cselect_b32 s4, s4, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_bitcmp1_b32 s1, 30 +; 
GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s3 +; GFX9-NEXT: s_bitcmp1_b32 s1, 31 +; GFX9-NEXT: s_cselect_b32 s1, s2, 0 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_u32 v2, v0 +; GFX9-NEXT: ds_sub_u32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB10_2: ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1064-NEXT: v_readlane_b32 s2, v1, 0 -; GFX1064-NEXT: v_readlane_b32 s3, v1, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[0:1] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: s_add_i32 s0, s2, s3 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_readlane_b32 s66, v0, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1064-NEXT: v_readlane_b32 s65, v0, 1 +; GFX1064-NEXT: v_readlane_b32 s64, v0, 2 +; GFX1064-NEXT: v_readlane_b32 s63, v0, 3 +; GFX1064-NEXT: v_readlane_b32 s62, v0, 4 +; 
GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GFX1064-NEXT: v_readlane_b32 s61, v0, 5 +; GFX1064-NEXT: v_readlane_b32 s60, v0, 6 +; GFX1064-NEXT: v_readlane_b32 s59, v0, 7 +; GFX1064-NEXT: v_readlane_b32 s58, v0, 8 +; GFX1064-NEXT: v_readfirstlane_b32 s67, v1 +; GFX1064-NEXT: v_readlane_b32 s57, v0, 9 +; GFX1064-NEXT: v_readlane_b32 s56, v0, 10 +; GFX1064-NEXT: v_readlane_b32 s55, v0, 11 +; GFX1064-NEXT: v_readlane_b32 s54, v0, 12 +; GFX1064-NEXT: v_readlane_b32 s53, v0, 13 +; GFX1064-NEXT: v_readlane_b32 s52, v0, 14 +; GFX1064-NEXT: v_readlane_b32 s51, v0, 15 +; GFX1064-NEXT: v_readlane_b32 s50, v0, 16 +; GFX1064-NEXT: v_readlane_b32 s49, v0, 17 +; GFX1064-NEXT: v_readlane_b32 s48, v0, 18 +; GFX1064-NEXT: v_readlane_b32 s47, v0, 19 +; GFX1064-NEXT: v_readlane_b32 s46, v0, 20 +; GFX1064-NEXT: v_readlane_b32 s45, v0, 21 +; GFX1064-NEXT: v_readlane_b32 s44, v0, 22 +; GFX1064-NEXT: v_readlane_b32 s43, v0, 23 +; GFX1064-NEXT: v_readlane_b32 s42, v0, 24 +; GFX1064-NEXT: v_readlane_b32 s41, v0, 25 +; GFX1064-NEXT: v_readlane_b32 s40, v0, 26 +; GFX1064-NEXT: v_readlane_b32 s39, v0, 27 +; GFX1064-NEXT: v_readlane_b32 s38, v0, 28 +; GFX1064-NEXT: v_readlane_b32 s37, v0, 29 +; GFX1064-NEXT: v_readlane_b32 s36, v0, 30 +; GFX1064-NEXT: v_readlane_b32 s35, v0, 31 +; GFX1064-NEXT: v_readlane_b32 s34, v0, 32 +; GFX1064-NEXT: v_readlane_b32 s33, v0, 33 +; GFX1064-NEXT: v_readlane_b32 s31, v0, 34 +; GFX1064-NEXT: v_readlane_b32 s30, v0, 35 +; GFX1064-NEXT: v_readlane_b32 s29, v0, 36 +; GFX1064-NEXT: v_readlane_b32 s28, v0, 37 +; GFX1064-NEXT: v_readlane_b32 s27, v0, 38 +; GFX1064-NEXT: v_readlane_b32 s26, v0, 39 +; GFX1064-NEXT: v_readlane_b32 s25, v0, 40 +; GFX1064-NEXT: v_readlane_b32 s24, v0, 41 +; GFX1064-NEXT: v_readlane_b32 s23, v0, 42 +; GFX1064-NEXT: v_readlane_b32 s22, v0, 43 +; GFX1064-NEXT: v_readlane_b32 s21, v0, 44 +; GFX1064-NEXT: v_readlane_b32 s20, v0, 45 +; GFX1064-NEXT: v_readlane_b32 s19, v0, 46 +; GFX1064-NEXT: v_readlane_b32 s18, v0, 47 +; GFX1064-NEXT: 
v_readlane_b32 s17, v0, 48 +; GFX1064-NEXT: v_readlane_b32 s16, v0, 49 +; GFX1064-NEXT: v_readlane_b32 s15, v0, 50 +; GFX1064-NEXT: v_readlane_b32 s14, v0, 51 +; GFX1064-NEXT: v_readlane_b32 s13, v0, 52 +; GFX1064-NEXT: v_readlane_b32 s12, v0, 53 +; GFX1064-NEXT: v_readlane_b32 s11, v0, 54 +; GFX1064-NEXT: v_readlane_b32 s10, v0, 55 +; GFX1064-NEXT: v_readlane_b32 s9, v0, 56 +; GFX1064-NEXT: v_readlane_b32 s8, v0, 57 +; GFX1064-NEXT: v_readlane_b32 s6, v0, 58 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 59 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: v_readlane_b32 s4, v0, 61 +; GFX1064-NEXT: v_readlane_b32 s3, v0, 62 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 63 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s67, v1 +; GFX1064-NEXT: s_and_saveexec_b64 s[68:69], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_cselect_b32 s66, s66, 0 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1064-NEXT: s_cselect_b32 s65, s65, 0 +; GFX1064-NEXT: s_add_i32 s65, s66, s65 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1064-NEXT: s_cselect_b32 s64, s64, 0 +; GFX1064-NEXT: s_add_i32 s64, s65, s64 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1064-NEXT: s_cselect_b32 s63, s63, 0 +; GFX1064-NEXT: s_add_i32 s63, s64, s63 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1064-NEXT: s_cselect_b32 s62, s62, 0 +; GFX1064-NEXT: s_add_i32 s62, s63, s62 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1064-NEXT: s_cselect_b32 s61, s61, 0 +; GFX1064-NEXT: s_add_i32 s61, s62, s61 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1064-NEXT: s_cselect_b32 s60, s60, 0 +; GFX1064-NEXT: s_add_i32 s60, s61, s60 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1064-NEXT: s_cselect_b32 s59, s59, 0 +; GFX1064-NEXT: s_add_i32 s59, s60, s59 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1064-NEXT: s_cselect_b32 s58, s58, 0 +; GFX1064-NEXT: s_add_i32 s58, s59, s58 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 
9 +; GFX1064-NEXT: s_cselect_b32 s57, s57, 0 +; GFX1064-NEXT: s_add_i32 s57, s58, s57 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1064-NEXT: s_cselect_b32 s56, s56, 0 +; GFX1064-NEXT: s_add_i32 s56, s57, s56 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1064-NEXT: s_cselect_b32 s55, s55, 0 +; GFX1064-NEXT: s_add_i32 s55, s56, s55 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1064-NEXT: s_cselect_b32 s54, s54, 0 +; GFX1064-NEXT: s_add_i32 s54, s55, s54 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1064-NEXT: s_cselect_b32 s53, s53, 0 +; GFX1064-NEXT: s_add_i32 s53, s54, s53 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1064-NEXT: s_cselect_b32 s52, s52, 0 +; GFX1064-NEXT: s_add_i32 s52, s53, s52 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1064-NEXT: s_cselect_b32 s51, s51, 0 +; GFX1064-NEXT: s_add_i32 s51, s52, s51 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1064-NEXT: s_cselect_b32 s50, s50, 0 +; GFX1064-NEXT: s_add_i32 s50, s51, s50 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1064-NEXT: s_cselect_b32 s49, s49, 0 +; GFX1064-NEXT: s_add_i32 s49, s50, s49 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1064-NEXT: s_cselect_b32 s48, s48, 0 +; GFX1064-NEXT: s_add_i32 s48, s49, s48 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1064-NEXT: s_cselect_b32 s47, s47, 0 +; GFX1064-NEXT: s_add_i32 s47, s48, s47 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1064-NEXT: s_cselect_b32 s46, s46, 0 +; GFX1064-NEXT: s_add_i32 s46, s47, s46 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1064-NEXT: s_cselect_b32 s45, s45, 0 +; GFX1064-NEXT: s_add_i32 s45, s46, s45 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1064-NEXT: s_cselect_b32 s44, s44, 0 +; GFX1064-NEXT: s_add_i32 s44, s45, s44 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1064-NEXT: s_cselect_b32 s43, s43, 0 +; GFX1064-NEXT: s_add_i32 s43, s44, s43 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1064-NEXT: s_cselect_b32 s42, s42, 0 +; GFX1064-NEXT: s_add_i32 s42, s43, s42 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1064-NEXT: s_cselect_b32 s41, s41, 0 +; 
GFX1064-NEXT: s_add_i32 s41, s42, s41 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1064-NEXT: s_cselect_b32 s40, s40, 0 +; GFX1064-NEXT: s_add_i32 s40, s41, s40 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1064-NEXT: s_cselect_b32 s39, s39, 0 +; GFX1064-NEXT: s_add_i32 s39, s40, s39 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1064-NEXT: s_cselect_b32 s38, s38, 0 +; GFX1064-NEXT: s_add_i32 s38, s39, s38 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1064-NEXT: s_cselect_b32 s37, s37, 0 +; GFX1064-NEXT: s_add_i32 s37, s38, s37 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1064-NEXT: s_cselect_b32 s36, s36, 0 +; GFX1064-NEXT: s_add_i32 s36, s37, s36 +; GFX1064-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1064-NEXT: s_cselect_b32 s0, s35, 0 +; GFX1064-NEXT: s_add_i32 s0, s36, s0 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 0 +; GFX1064-NEXT: s_cselect_b32 s34, s34, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s34 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 1 +; GFX1064-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s33 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 2 +; GFX1064-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s31 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 3 +; GFX1064-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s30 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 4 +; GFX1064-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s29 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 5 +; GFX1064-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s28 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 6 +; GFX1064-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s27 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 7 +; GFX1064-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s26 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 8 +; GFX1064-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s25 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 9 +; GFX1064-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s24 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 10 +; 
GFX1064-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s23 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 11 +; GFX1064-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s22 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 12 +; GFX1064-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s21 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 13 +; GFX1064-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s20 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 14 +; GFX1064-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s19 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 15 +; GFX1064-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s18 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 16 +; GFX1064-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s17 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 17 +; GFX1064-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s16 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 18 +; GFX1064-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s15 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 19 +; GFX1064-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s14 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 20 +; GFX1064-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s13 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 21 +; GFX1064-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s12 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 22 +; GFX1064-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s11 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 23 +; GFX1064-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s10 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 24 +; GFX1064-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s9 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 25 +; GFX1064-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s8 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 26 +; GFX1064-NEXT: s_cselect_b32 s6, s6, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s6 +; 
GFX1064-NEXT: s_bitcmp1_b32 s1, 27 +; GFX1064-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s5 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 28 +; GFX1064-NEXT: s_cselect_b32 s5, s7, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s5 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 29 +; GFX1064-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s4 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 30 +; GFX1064-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s3 +; GFX1064-NEXT: s_bitcmp1_b32 s1, 31 +; GFX1064-NEXT: s_cselect_b32 s1, s2, 0 +; GFX1064-NEXT: s_add_i32 s0, s0, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_u32 v0, v3 +; GFX1064-NEXT: ds_sub_u32 v0, v1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB10_2: @@ -2587,29 +13957,145 @@ ; ; GFX1032-LABEL: sub_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v3, v1 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1032-NEXT: v_readlane_b32 s33, v0, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1032-NEXT: v_readlane_b32 s31, v0, 1 +; GFX1032-NEXT: v_readlane_b32 s30, v0, 2 +; GFX1032-NEXT: v_readlane_b32 s29, v0, 3 +; GFX1032-NEXT: v_readlane_b32 s28, v0, 4 +; GFX1032-NEXT: v_readfirstlane_b32 s34, v1 +; GFX1032-NEXT: v_readlane_b32 s27, v0, 5 +; GFX1032-NEXT: v_readlane_b32 s26, v0, 6 +; GFX1032-NEXT: v_readlane_b32 s25, v0, 7 +; GFX1032-NEXT: v_readlane_b32 s24, v0, 8 +; GFX1032-NEXT: v_readlane_b32 s23, v0, 9 +; GFX1032-NEXT: v_readlane_b32 s22, v0, 10 +; GFX1032-NEXT: v_readlane_b32 s21, v0, 11 +; GFX1032-NEXT: v_readlane_b32 s20, v0, 12 +; GFX1032-NEXT: v_readlane_b32 s19, v0, 13 +; GFX1032-NEXT: v_readlane_b32 s18, v0, 14 +; GFX1032-NEXT: v_readlane_b32 s17, v0, 15 +; GFX1032-NEXT: v_readlane_b32 s16, v0, 16 +; GFX1032-NEXT: v_readlane_b32 s15, v0, 17 +; GFX1032-NEXT: v_readlane_b32 s14, v0, 18 +; GFX1032-NEXT: v_readlane_b32 s13, v0, 19 +; GFX1032-NEXT: v_readlane_b32 s12, v0, 20 +; GFX1032-NEXT: v_readlane_b32 s11, v0, 21 +; GFX1032-NEXT: v_readlane_b32 s10, v0, 22 +; GFX1032-NEXT: v_readlane_b32 s9, v0, 23 +; GFX1032-NEXT: v_readlane_b32 s8, v0, 24 +; GFX1032-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1032-NEXT: v_readlane_b32 s5, v0, 26 +; GFX1032-NEXT: v_readlane_b32 s4, v0, 27 +; GFX1032-NEXT: v_readlane_b32 s6, v0, 28 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1032-NEXT: v_readlane_b32 s1, v0, 31 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s34, v1 +; GFX1032-NEXT: s_and_saveexec_b32 s34, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_2 ; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1032-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1032-NEXT: s_add_i32 s31, s33, s31 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1032-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1032-NEXT: s_add_i32 s30, s31, s30 
+; GFX1032-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1032-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1032-NEXT: s_add_i32 s29, s30, s29 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1032-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1032-NEXT: s_add_i32 s28, s29, s28 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1032-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1032-NEXT: s_add_i32 s27, s28, s27 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1032-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1032-NEXT: s_add_i32 s26, s27, s26 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1032-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1032-NEXT: s_add_i32 s25, s26, s25 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1032-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1032-NEXT: s_add_i32 s24, s25, s24 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1032-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1032-NEXT: s_add_i32 s23, s24, s23 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1032-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1032-NEXT: s_add_i32 s22, s23, s22 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1032-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1032-NEXT: s_add_i32 s21, s22, s21 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1032-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1032-NEXT: s_add_i32 s20, s21, s20 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1032-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1032-NEXT: s_add_i32 s19, s20, s19 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1032-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1032-NEXT: s_add_i32 s18, s19, s18 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1032-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1032-NEXT: s_add_i32 s17, s18, s17 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1032-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1032-NEXT: s_add_i32 s16, s17, s16 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1032-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1032-NEXT: s_add_i32 s15, s16, s15 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1032-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1032-NEXT: s_add_i32 s14, s15, s14 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1032-NEXT: 
s_cselect_b32 s13, s13, 0 +; GFX1032-NEXT: s_add_i32 s13, s14, s13 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1032-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1032-NEXT: s_add_i32 s12, s13, s12 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1032-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1032-NEXT: s_add_i32 s11, s12, s11 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1032-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1032-NEXT: s_add_i32 s10, s11, s10 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1032-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1032-NEXT: s_add_i32 s9, s10, s9 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1032-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1032-NEXT: s_add_i32 s8, s9, s8 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1032-NEXT: s_cselect_b32 s7, s7, 0 +; GFX1032-NEXT: s_add_i32 s7, s8, s7 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1032-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1032-NEXT: s_add_i32 s5, s7, s5 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1032-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1032-NEXT: s_add_i32 s4, s5, s4 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1032-NEXT: s_cselect_b32 s5, s6, 0 +; GFX1032-NEXT: s_add_i32 s4, s4, s5 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_add_i32 s3, s4, s3 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_add_i32 s2, s3, s2 +; GFX1032-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1032-NEXT: s_cselect_b32 s0, s1, 0 +; GFX1032-NEXT: s_add_i32 s0, s2, s0 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_u32 v0, v3 +; GFX1032-NEXT: ds_sub_u32 v0, v1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB10_2: @@ -2617,41 +14103,306 @@ ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; 
GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: v_permlane64_b32 v2, v1 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1 -; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1164-NEXT: s_mov_b64 exec, s[0:1] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1164-NEXT: v_mov_b32_e32 v3, v1 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-NEXT: v_readlane_b32 s66, v0, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1164-NEXT: v_readlane_b32 s65, v0, 1 +; GFX1164-NEXT: v_readlane_b32 s64, v0, 2 +; GFX1164-NEXT: v_readlane_b32 s63, v0, 3 +; GFX1164-NEXT: v_readlane_b32 s62, v0, 4 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; 
GFX1164-NEXT: v_readlane_b32 s61, v0, 5 +; GFX1164-NEXT: v_readlane_b32 s60, v0, 6 +; GFX1164-NEXT: v_readlane_b32 s59, v0, 7 +; GFX1164-NEXT: v_readlane_b32 s58, v0, 8 +; GFX1164-NEXT: v_readlane_b32 s57, v0, 9 +; GFX1164-NEXT: v_readlane_b32 s56, v0, 10 +; GFX1164-NEXT: v_readlane_b32 s55, v0, 11 +; GFX1164-NEXT: v_readlane_b32 s54, v0, 12 +; GFX1164-NEXT: v_readlane_b32 s53, v0, 13 +; GFX1164-NEXT: v_readlane_b32 s52, v0, 14 +; GFX1164-NEXT: v_readlane_b32 s51, v0, 15 +; GFX1164-NEXT: v_readlane_b32 s50, v0, 16 +; GFX1164-NEXT: v_readlane_b32 s49, v0, 17 +; GFX1164-NEXT: v_readlane_b32 s48, v0, 18 +; GFX1164-NEXT: v_readlane_b32 s47, v0, 19 +; GFX1164-NEXT: v_readlane_b32 s46, v0, 20 +; GFX1164-NEXT: v_readlane_b32 s45, v0, 21 +; GFX1164-NEXT: v_readlane_b32 s44, v0, 22 +; GFX1164-NEXT: v_readlane_b32 s43, v0, 23 +; GFX1164-NEXT: v_readlane_b32 s42, v0, 24 +; GFX1164-NEXT: v_readlane_b32 s41, v0, 25 +; GFX1164-NEXT: v_readlane_b32 s40, v0, 26 +; GFX1164-NEXT: v_readlane_b32 s39, v0, 27 +; GFX1164-NEXT: v_readlane_b32 s38, v0, 28 +; GFX1164-NEXT: v_readlane_b32 s37, v0, 29 +; GFX1164-NEXT: v_readlane_b32 s36, v0, 30 +; GFX1164-NEXT: v_readlane_b32 s35, v0, 31 +; GFX1164-NEXT: v_readlane_b32 s34, v0, 32 +; GFX1164-NEXT: v_readlane_b32 s33, v0, 33 +; GFX1164-NEXT: v_readlane_b32 s31, v0, 34 +; GFX1164-NEXT: v_readlane_b32 s30, v0, 35 +; GFX1164-NEXT: v_readlane_b32 s29, v0, 36 +; GFX1164-NEXT: v_readlane_b32 s28, v0, 37 +; GFX1164-NEXT: v_readlane_b32 s27, v0, 38 +; GFX1164-NEXT: v_readlane_b32 s26, v0, 39 +; GFX1164-NEXT: v_readlane_b32 s25, v0, 40 +; GFX1164-NEXT: v_readlane_b32 s24, v0, 41 +; GFX1164-NEXT: v_readlane_b32 s23, v0, 42 +; GFX1164-NEXT: v_readlane_b32 s22, v0, 43 +; GFX1164-NEXT: v_readlane_b32 s21, v0, 44 +; GFX1164-NEXT: v_readlane_b32 s20, v0, 45 +; GFX1164-NEXT: v_readlane_b32 s19, v0, 46 +; GFX1164-NEXT: v_readlane_b32 s18, v0, 47 +; GFX1164-NEXT: v_readlane_b32 s17, v0, 48 +; GFX1164-NEXT: v_readlane_b32 s16, v0, 49 +; GFX1164-NEXT: 
v_readlane_b32 s15, v0, 50 +; GFX1164-NEXT: v_readlane_b32 s14, v0, 51 +; GFX1164-NEXT: v_readlane_b32 s13, v0, 52 +; GFX1164-NEXT: v_readlane_b32 s12, v0, 53 +; GFX1164-NEXT: v_readlane_b32 s11, v0, 54 +; GFX1164-NEXT: v_readlane_b32 s10, v0, 55 +; GFX1164-NEXT: v_readlane_b32 s9, v0, 56 +; GFX1164-NEXT: v_readlane_b32 s8, v0, 57 +; GFX1164-NEXT: v_readlane_b32 s6, v0, 58 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 59 +; GFX1164-NEXT: v_readfirstlane_b32 s67, v1 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: v_readlane_b32 s4, v0, 61 +; GFX1164-NEXT: v_readlane_b32 s3, v0, 62 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 63 +; GFX1164-NEXT: s_mov_b64 s[68:69], exec +; GFX1164-NEXT: v_cmpx_eq_u32_e64 s67, v1 ; GFX1164-NEXT: s_cbranch_execz .LBB10_2 ; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_cselect_b32 s66, s66, 0 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1164-NEXT: s_cselect_b32 s65, s65, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s65, s66, s65 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1164-NEXT: s_cselect_b32 s64, s64, 0 +; GFX1164-NEXT: s_add_i32 s64, s65, s64 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1164-NEXT: s_cselect_b32 s63, s63, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s63, s64, s63 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1164-NEXT: s_cselect_b32 s62, s62, 0 +; GFX1164-NEXT: s_add_i32 s62, s63, s62 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1164-NEXT: s_cselect_b32 s61, s61, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s61, s62, s61 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1164-NEXT: s_cselect_b32 s60, s60, 0 +; GFX1164-NEXT: s_add_i32 s60, s61, s60 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1164-NEXT: s_cselect_b32 s59, s59, 0 +; 
GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s59, s60, s59 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1164-NEXT: s_cselect_b32 s58, s58, 0 +; GFX1164-NEXT: s_add_i32 s58, s59, s58 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1164-NEXT: s_cselect_b32 s57, s57, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s57, s58, s57 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1164-NEXT: s_cselect_b32 s56, s56, 0 +; GFX1164-NEXT: s_add_i32 s56, s57, s56 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1164-NEXT: s_cselect_b32 s55, s55, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s55, s56, s55 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1164-NEXT: s_cselect_b32 s54, s54, 0 +; GFX1164-NEXT: s_add_i32 s54, s55, s54 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1164-NEXT: s_cselect_b32 s53, s53, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s53, s54, s53 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1164-NEXT: s_cselect_b32 s52, s52, 0 +; GFX1164-NEXT: s_add_i32 s52, s53, s52 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1164-NEXT: s_cselect_b32 s51, s51, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s51, s52, s51 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1164-NEXT: s_cselect_b32 s50, s50, 0 +; GFX1164-NEXT: s_add_i32 s50, s51, s50 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1164-NEXT: s_cselect_b32 s49, s49, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s49, s50, s49 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 18 +; GFX1164-NEXT: s_cselect_b32 s48, s48, 0 +; GFX1164-NEXT: s_add_i32 s48, s49, s48 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1164-NEXT: 
s_cselect_b32 s47, s47, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s47, s48, s47 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1164-NEXT: s_cselect_b32 s46, s46, 0 +; GFX1164-NEXT: s_add_i32 s46, s47, s46 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1164-NEXT: s_cselect_b32 s45, s45, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s45, s46, s45 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1164-NEXT: s_cselect_b32 s44, s44, 0 +; GFX1164-NEXT: s_add_i32 s44, s45, s44 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1164-NEXT: s_cselect_b32 s43, s43, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s43, s44, s43 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1164-NEXT: s_cselect_b32 s42, s42, 0 +; GFX1164-NEXT: s_add_i32 s42, s43, s42 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1164-NEXT: s_cselect_b32 s41, s41, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s41, s42, s41 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1164-NEXT: s_cselect_b32 s40, s40, 0 +; GFX1164-NEXT: s_add_i32 s40, s41, s40 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1164-NEXT: s_cselect_b32 s39, s39, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s39, s40, s39 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1164-NEXT: s_cselect_b32 s38, s38, 0 +; GFX1164-NEXT: s_add_i32 s38, s39, s38 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1164-NEXT: s_cselect_b32 s37, s37, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s37, s38, s37 +; GFX1164-NEXT: s_bitcmp1_b32 s0, 30 +; GFX1164-NEXT: s_cselect_b32 s36, s36, 0 +; GFX1164-NEXT: s_add_i32 s36, s37, s36 +; GFX1164-NEXT: 
s_bitcmp1_b32 s0, 31 +; GFX1164-NEXT: s_cselect_b32 s0, s35, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s36, s0 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 0 +; GFX1164-NEXT: s_cselect_b32 s34, s34, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s34 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 1 +; GFX1164-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s33 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 2 +; GFX1164-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s31 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 3 +; GFX1164-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s30 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 4 +; GFX1164-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s29 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 5 +; GFX1164-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s28 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 6 +; GFX1164-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s27 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 7 +; GFX1164-NEXT: s_cselect_b32 s26, s26, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s26 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 8 +; GFX1164-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s25 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 9 +; GFX1164-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s24 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 10 +; GFX1164-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s23 +; GFX1164-NEXT: 
s_bitcmp1_b32 s1, 11 +; GFX1164-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s22 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 12 +; GFX1164-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s21 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 13 +; GFX1164-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s20 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 14 +; GFX1164-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s19 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 15 +; GFX1164-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s18 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 16 +; GFX1164-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s17 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 17 +; GFX1164-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s16 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 18 +; GFX1164-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s15 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 19 +; GFX1164-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s14 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 20 +; GFX1164-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s13 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 21 +; GFX1164-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s12 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 22 +; GFX1164-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s11 +; 
GFX1164-NEXT: s_bitcmp1_b32 s1, 23 +; GFX1164-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s10 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 24 +; GFX1164-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s9 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 25 +; GFX1164-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s8 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 26 +; GFX1164-NEXT: s_cselect_b32 s6, s6, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s6 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 27 +; GFX1164-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s5 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 28 +; GFX1164-NEXT: s_cselect_b32 s5, s7, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s5 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 29 +; GFX1164-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s4 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 30 +; GFX1164-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1164-NEXT: s_add_i32 s0, s0, s3 +; GFX1164-NEXT: s_bitcmp1_b32 s1, 31 +; GFX1164-NEXT: s_cselect_b32 s1, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_add_i32 s0, s0, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_u32 v0, v3 +; GFX1164-NEXT: ds_sub_u32 v0, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB10_2: @@ -2659,33 +14410,160 @@ ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; 
GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s0, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX1132-NEXT: s_mov_b32 exec_lo, s0 -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, v1 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-NEXT: v_readlane_b32 s33, v0, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GFX1132-NEXT: v_readlane_b32 s31, v0, 1 +; GFX1132-NEXT: v_readlane_b32 s30, v0, 2 +; GFX1132-NEXT: v_readlane_b32 s29, v0, 3 +; GFX1132-NEXT: v_readlane_b32 s28, v0, 4 +; GFX1132-NEXT: v_readfirstlane_b32 s34, v1 +; GFX1132-NEXT: v_readlane_b32 s27, v0, 5 +; GFX1132-NEXT: v_readlane_b32 s26, v0, 6 +; GFX1132-NEXT: v_readlane_b32 s25, v0, 7 +; GFX1132-NEXT: v_readlane_b32 s24, v0, 8 +; GFX1132-NEXT: v_readlane_b32 s23, v0, 9 +; GFX1132-NEXT: v_readlane_b32 s22, v0, 10 +; GFX1132-NEXT: v_readlane_b32 s21, v0, 11 +; GFX1132-NEXT: v_readlane_b32 s20, v0, 12 +; GFX1132-NEXT: 
v_readlane_b32 s19, v0, 13 +; GFX1132-NEXT: v_readlane_b32 s18, v0, 14 +; GFX1132-NEXT: v_readlane_b32 s17, v0, 15 +; GFX1132-NEXT: v_readlane_b32 s16, v0, 16 +; GFX1132-NEXT: v_readlane_b32 s15, v0, 17 +; GFX1132-NEXT: v_readlane_b32 s14, v0, 18 +; GFX1132-NEXT: v_readlane_b32 s13, v0, 19 +; GFX1132-NEXT: v_readlane_b32 s12, v0, 20 +; GFX1132-NEXT: v_readlane_b32 s11, v0, 21 +; GFX1132-NEXT: v_readlane_b32 s10, v0, 22 +; GFX1132-NEXT: v_readlane_b32 s9, v0, 23 +; GFX1132-NEXT: v_readlane_b32 s8, v0, 24 +; GFX1132-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1132-NEXT: v_readlane_b32 s5, v0, 26 +; GFX1132-NEXT: v_readlane_b32 s4, v0, 27 +; GFX1132-NEXT: v_readlane_b32 s6, v0, 28 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 30 +; GFX1132-NEXT: v_readlane_b32 s1, v0, 31 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s34, v1 +; GFX1132-NEXT: s_and_saveexec_b32 s34, vcc_lo ; GFX1132-NEXT: s_cbranch_execz .LBB10_2 ; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bitcmp1_b32 s0, 0 +; GFX1132-NEXT: s_cselect_b32 s33, s33, 0 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 1 +; GFX1132-NEXT: s_cselect_b32 s31, s31, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s31, s33, s31 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 2 +; GFX1132-NEXT: s_cselect_b32 s30, s30, 0 +; GFX1132-NEXT: s_add_i32 s30, s31, s30 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 3 +; GFX1132-NEXT: s_cselect_b32 s29, s29, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s29, s30, s29 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 4 +; GFX1132-NEXT: s_cselect_b32 s28, s28, 0 +; GFX1132-NEXT: s_add_i32 s28, s29, s28 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 5 +; GFX1132-NEXT: s_cselect_b32 s27, s27, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s27, s28, s27 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 6 +; GFX1132-NEXT: 
s_cselect_b32 s26, s26, 0 +; GFX1132-NEXT: s_add_i32 s26, s27, s26 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 7 +; GFX1132-NEXT: s_cselect_b32 s25, s25, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s25, s26, s25 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 8 +; GFX1132-NEXT: s_cselect_b32 s24, s24, 0 +; GFX1132-NEXT: s_add_i32 s24, s25, s24 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 9 +; GFX1132-NEXT: s_cselect_b32 s23, s23, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s23, s24, s23 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 10 +; GFX1132-NEXT: s_cselect_b32 s22, s22, 0 +; GFX1132-NEXT: s_add_i32 s22, s23, s22 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 11 +; GFX1132-NEXT: s_cselect_b32 s21, s21, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s21, s22, s21 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 12 +; GFX1132-NEXT: s_cselect_b32 s20, s20, 0 +; GFX1132-NEXT: s_add_i32 s20, s21, s20 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 13 +; GFX1132-NEXT: s_cselect_b32 s19, s19, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s19, s20, s19 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 14 +; GFX1132-NEXT: s_cselect_b32 s18, s18, 0 +; GFX1132-NEXT: s_add_i32 s18, s19, s18 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 15 +; GFX1132-NEXT: s_cselect_b32 s17, s17, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s17, s18, s17 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 16 +; GFX1132-NEXT: s_cselect_b32 s16, s16, 0 +; GFX1132-NEXT: s_add_i32 s16, s17, s16 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 17 +; GFX1132-NEXT: s_cselect_b32 s15, s15, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s15, s16, s15 +; GFX1132-NEXT: 
s_bitcmp1_b32 s0, 18 +; GFX1132-NEXT: s_cselect_b32 s14, s14, 0 +; GFX1132-NEXT: s_add_i32 s14, s15, s14 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 19 +; GFX1132-NEXT: s_cselect_b32 s13, s13, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s13, s14, s13 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 20 +; GFX1132-NEXT: s_cselect_b32 s12, s12, 0 +; GFX1132-NEXT: s_add_i32 s12, s13, s12 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 21 +; GFX1132-NEXT: s_cselect_b32 s11, s11, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s11, s12, s11 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 22 +; GFX1132-NEXT: s_cselect_b32 s10, s10, 0 +; GFX1132-NEXT: s_add_i32 s10, s11, s10 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 23 +; GFX1132-NEXT: s_cselect_b32 s9, s9, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s9, s10, s9 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 24 +; GFX1132-NEXT: s_cselect_b32 s8, s8, 0 +; GFX1132-NEXT: s_add_i32 s8, s9, s8 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 25 +; GFX1132-NEXT: s_cselect_b32 s7, s7, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s7, s8, s7 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 26 +; GFX1132-NEXT: s_cselect_b32 s5, s5, 0 +; GFX1132-NEXT: s_add_i32 s5, s7, s5 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 27 +; GFX1132-NEXT: s_cselect_b32 s4, s4, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s4, s5, s4 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 28 +; GFX1132-NEXT: s_cselect_b32 s5, s6, 0 +; GFX1132-NEXT: s_add_i32 s4, s4, s5 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 29 +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s3, s4, s3 +; GFX1132-NEXT: 
s_bitcmp1_b32 s0, 30 +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_add_i32 s2, s3, s2 +; GFX1132-NEXT: s_bitcmp1_b32 s0, 31 +; GFX1132-NEXT: s_cselect_b32 s0, s1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_add_i32 s0, s2, s0 +; GFX1132-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_u32 v0, v3 +; GFX1132-NEXT: ds_sub_u32 v0, v1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB10_2: @@ -3344,273 +15222,4557 @@ ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, -1, 0 +; GFX8-NEXT: s_branch .LBB14_3 +; GFX8-NEXT: .LBB14_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, -1 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB14_6 +; GFX8-NEXT: .LBB14_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB14_9 +; GFX8-NEXT: .LBB14_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB14_12 +; GFX8-NEXT: .LBB14_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB14_15 +; GFX8-NEXT: .LBB14_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB14_18 +; GFX8-NEXT: .LBB14_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB14_21 +; GFX8-NEXT: .LBB14_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: 
s_branch .LBB14_24 +; GFX8-NEXT: .LBB14_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB14_27 +; GFX8-NEXT: .LBB14_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB14_30 +; GFX8-NEXT: .LBB14_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB14_33 +; GFX8-NEXT: .LBB14_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB14_36 +; GFX8-NEXT: .LBB14_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB14_39 +; GFX8-NEXT: .LBB14_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB14_42 +; GFX8-NEXT: .LBB14_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB14_45 +; GFX8-NEXT: .LBB14_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 
s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB14_48 +; GFX8-NEXT: .LBB14_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB14_51 +; GFX8-NEXT: .LBB14_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB14_54 +; GFX8-NEXT: .LBB14_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; 
GFX8-NEXT: s_branch .LBB14_57 +; GFX8-NEXT: .LBB14_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB14_60 +; GFX8-NEXT: .LBB14_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB14_63 +; GFX8-NEXT: .LBB14_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB14_66 +; GFX8-NEXT: .LBB14_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB14_69 +; GFX8-NEXT: .LBB14_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB14_72 +; GFX8-NEXT: .LBB14_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB14_75 +; GFX8-NEXT: .LBB14_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB14_78 +; GFX8-NEXT: .LBB14_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_78: +; GFX8-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB14_81 +; GFX8-NEXT: .LBB14_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB14_84 +; GFX8-NEXT: .LBB14_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB14_87 +; GFX8-NEXT: .LBB14_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_89 +; GFX8-NEXT: ; 
%bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB14_90 +; GFX8-NEXT: .LBB14_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB14_93 +; GFX8-NEXT: .LBB14_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, -1 +; GFX8-NEXT: s_and_b32 s4, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s5, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s4, 31 +; GFX8-NEXT: s_branch .LBB14_96 +; GFX8-NEXT: .LBB14_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_96: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, -1 +; GFX8-NEXT: s_and_b32 s6, s4, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB14_99 +; GFX8-NEXT: .LBB14_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: 
s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB14_102 +; GFX8-NEXT: .LBB14_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB14_105 +; GFX8-NEXT: .LBB14_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB14_108 +; GFX8-NEXT: .LBB14_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB14_111 +; GFX8-NEXT: 
.LBB14_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB14_114 +; GFX8-NEXT: .LBB14_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB14_117 +; GFX8-NEXT: .LBB14_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB14_120 +; GFX8-NEXT: .LBB14_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB14_123 +; GFX8-NEXT: .LBB14_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB14_126 +; GFX8-NEXT: .LBB14_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB14_129 +; GFX8-NEXT: .LBB14_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB14_132 +; GFX8-NEXT: .LBB14_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; 
GFX8-NEXT: .LBB14_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB14_135 +; GFX8-NEXT: .LBB14_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB14_138 +; GFX8-NEXT: .LBB14_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB14_141 +; GFX8-NEXT: .LBB14_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB14_144 +; GFX8-NEXT: .LBB14_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB14_147 +; GFX8-NEXT: .LBB14_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB14_150 +; GFX8-NEXT: .LBB14_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB14_153 +; GFX8-NEXT: .LBB14_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: 
.LBB14_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB14_156 +; GFX8-NEXT: .LBB14_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB14_159 +; GFX8-NEXT: .LBB14_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB14_162 +; GFX8-NEXT: .LBB14_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB14_165 +; GFX8-NEXT: .LBB14_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB14_168 +; GFX8-NEXT: .LBB14_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB14_171 +; GFX8-NEXT: .LBB14_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB14_174 +; GFX8-NEXT: .LBB14_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_174: +; GFX8-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB14_177 +; GFX8-NEXT: .LBB14_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB14_180 +; GFX8-NEXT: .LBB14_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB14_183 +; GFX8-NEXT: .LBB14_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX8-NEXT: v_readlane_b32 s2, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB14_186 +; GFX8-NEXT: .LBB14_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB14_189 +; GFX8-NEXT: .LBB14_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_and_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB14_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB14_192 +; GFX8-NEXT: .LBB14_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB14_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB14_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB14_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, -1 +; GFX8-NEXT: s_and_b32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB14_2: +; GFX8-NEXT: .LBB14_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: 
s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_writelane_b32 v1, -1, 0 +; GFX9-NEXT: s_branch .LBB14_3 +; GFX9-NEXT: .LBB14_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, -1 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB14_6 +; GFX9-NEXT: .LBB14_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB14_9 +; GFX9-NEXT: .LBB14_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB14_12 +; GFX9-NEXT: .LBB14_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB14_15 +; GFX9-NEXT: .LBB14_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB14_18 +; GFX9-NEXT: .LBB14_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB14_21 +; GFX9-NEXT: .LBB14_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; 
GFX9-NEXT: s_branch .LBB14_24 +; GFX9-NEXT: .LBB14_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB14_27 +; GFX9-NEXT: .LBB14_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB14_30 +; GFX9-NEXT: .LBB14_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB14_33 +; GFX9-NEXT: .LBB14_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB14_36 +; GFX9-NEXT: .LBB14_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB14_39 +; GFX9-NEXT: .LBB14_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB14_42 +; GFX9-NEXT: .LBB14_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB14_45 +; GFX9-NEXT: .LBB14_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: 
s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB14_48 +; GFX9-NEXT: .LBB14_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB14_51 +; GFX9-NEXT: .LBB14_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB14_54 +; GFX9-NEXT: .LBB14_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 
v1, s6, 18 +; GFX9-NEXT: s_branch .LBB14_57 +; GFX9-NEXT: .LBB14_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB14_60 +; GFX9-NEXT: .LBB14_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB14_63 +; GFX9-NEXT: .LBB14_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB14_66 +; GFX9-NEXT: .LBB14_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB14_69 +; GFX9-NEXT: .LBB14_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB14_72 +; GFX9-NEXT: .LBB14_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB14_75 +; GFX9-NEXT: .LBB14_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB14_78 +; GFX9-NEXT: .LBB14_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_78: +; GFX9-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB14_81 +; GFX9-NEXT: .LBB14_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB14_84 +; GFX9-NEXT: .LBB14_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB14_87 +; GFX9-NEXT: .LBB14_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_89 +; 
GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB14_90 +; GFX9-NEXT: .LBB14_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB14_93 +; GFX9-NEXT: .LBB14_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, -1 +; GFX9-NEXT: s_and_b32 s4, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s5, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s4, 31 +; GFX9-NEXT: s_branch .LBB14_96 +; GFX9-NEXT: .LBB14_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_96: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, -1 +; GFX9-NEXT: s_and_b32 s6, s4, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB14_99 +; GFX9-NEXT: .LBB14_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; 
GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB14_102 +; GFX9-NEXT: .LBB14_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB14_105 +; GFX9-NEXT: .LBB14_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB14_108 +; GFX9-NEXT: .LBB14_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB14_111 +; 
GFX9-NEXT: .LBB14_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB14_114 +; GFX9-NEXT: .LBB14_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB14_117 +; GFX9-NEXT: .LBB14_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB14_120 +; GFX9-NEXT: .LBB14_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB14_123 +; GFX9-NEXT: .LBB14_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB14_126 +; GFX9-NEXT: .LBB14_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB14_129 +; GFX9-NEXT: .LBB14_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB14_132 +; GFX9-NEXT: .LBB14_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 
+; GFX9-NEXT: .LBB14_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB14_135 +; GFX9-NEXT: .LBB14_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB14_138 +; GFX9-NEXT: .LBB14_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB14_141 +; GFX9-NEXT: .LBB14_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB14_144 +; GFX9-NEXT: .LBB14_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB14_147 +; GFX9-NEXT: .LBB14_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB14_150 +; GFX9-NEXT: .LBB14_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB14_153 +; GFX9-NEXT: .LBB14_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB14_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB14_156 +; GFX9-NEXT: .LBB14_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB14_159 +; GFX9-NEXT: .LBB14_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB14_162 +; GFX9-NEXT: .LBB14_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB14_165 +; GFX9-NEXT: .LBB14_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB14_168 +; GFX9-NEXT: .LBB14_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB14_171 +; GFX9-NEXT: .LBB14_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB14_174 +; GFX9-NEXT: .LBB14_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_174: +; GFX9-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB14_177 +; GFX9-NEXT: .LBB14_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB14_180 +; GFX9-NEXT: .LBB14_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB14_183 +; GFX9-NEXT: .LBB14_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX9-NEXT: v_readlane_b32 s2, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB14_186 +; GFX9-NEXT: .LBB14_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB14_189 +; GFX9-NEXT: .LBB14_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_and_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB14_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB14_192 +; GFX9-NEXT: .LBB14_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB14_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB14_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB14_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, -1 +; GFX9-NEXT: s_and_b32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 +; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB14_2: +; GFX9-NEXT: .LBB14_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: 
v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1064-NEXT: s_branch .LBB14_3 +; GFX1064-NEXT: .LBB14_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB14_6 +; GFX1064-NEXT: .LBB14_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB14_9 +; GFX1064-NEXT: .LBB14_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: 
.LBB14_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB14_12 +; GFX1064-NEXT: .LBB14_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB14_15 +; GFX1064-NEXT: .LBB14_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB14_18 +; GFX1064-NEXT: .LBB14_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 
s7, v0, 6 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB14_21 +; GFX1064-NEXT: .LBB14_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB14_24 +; GFX1064-NEXT: .LBB14_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB14_27 +; GFX1064-NEXT: .LBB14_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 
.LBB14_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB14_30 +; GFX1064-NEXT: .LBB14_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB14_33 +; GFX1064-NEXT: .LBB14_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB14_36 +; GFX1064-NEXT: .LBB14_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB14_39 +; GFX1064-NEXT: .LBB14_38: +; GFX1064-NEXT: ; 
implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB14_42 +; GFX1064-NEXT: .LBB14_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB14_45 +; GFX1064-NEXT: .LBB14_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB14_48 +; GFX1064-NEXT: .LBB14_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; 
GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB14_51 +; GFX1064-NEXT: .LBB14_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB14_54 +; GFX1064-NEXT: .LBB14_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB14_57 +; GFX1064-NEXT: .LBB14_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB14_60 +; GFX1064-NEXT: .LBB14_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB14_63 +; GFX1064-NEXT: .LBB14_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB14_66 +; GFX1064-NEXT: .LBB14_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: 
v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB14_69 +; GFX1064-NEXT: .LBB14_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB14_72 +; GFX1064-NEXT: .LBB14_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB14_75 +; GFX1064-NEXT: .LBB14_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB14_78 +; GFX1064-NEXT: .LBB14_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_78: +; GFX1064-NEXT: s_and_b64 
s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB14_81 +; GFX1064-NEXT: .LBB14_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB14_84 +; GFX1064-NEXT: .LBB14_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB14_87 +; GFX1064-NEXT: .LBB14_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_and_b32 s6, s6, 
s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB14_90 +; GFX1064-NEXT: .LBB14_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1064-NEXT: s_and_b32 s4, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_mov_b32 s9, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB14_93 +; GFX1064-NEXT: .LBB14_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_and_b32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB14_96 +; GFX1064-NEXT: .LBB14_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_and_b32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 
.LBB14_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB14_99 +; GFX1064-NEXT: .LBB14_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB14_102 +; GFX1064-NEXT: .LBB14_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB14_105 +; GFX1064-NEXT: .LBB14_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB14_108 +; GFX1064-NEXT: .LBB14_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; 
GFX1064-NEXT: .LBB14_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB14_111 +; GFX1064-NEXT: .LBB14_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB14_114 +; GFX1064-NEXT: .LBB14_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB14_117 +; GFX1064-NEXT: .LBB14_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 
+; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB14_120 +; GFX1064-NEXT: .LBB14_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB14_123 +; GFX1064-NEXT: .LBB14_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB14_126 +; GFX1064-NEXT: .LBB14_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB14_129 +; GFX1064-NEXT: .LBB14_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB14_132 +; GFX1064-NEXT: .LBB14_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB14_135 +; GFX1064-NEXT: .LBB14_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 
45 +; GFX1064-NEXT: s_branch .LBB14_138 +; GFX1064-NEXT: .LBB14_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB14_141 +; GFX1064-NEXT: .LBB14_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB14_144 +; GFX1064-NEXT: .LBB14_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB14_147 +; GFX1064-NEXT: .LBB14_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_147: +; 
GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB14_150 +; GFX1064-NEXT: .LBB14_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB14_153 +; GFX1064-NEXT: .LBB14_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB14_156 +; GFX1064-NEXT: .LBB14_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 
+; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB14_159 +; GFX1064-NEXT: .LBB14_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB14_162 +; GFX1064-NEXT: .LBB14_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB14_165 +; GFX1064-NEXT: .LBB14_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB14_168 +; GFX1064-NEXT: .LBB14_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB14_171 +; GFX1064-NEXT: .LBB14_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB14_174 +; GFX1064-NEXT: .LBB14_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: 
v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB14_177 +; GFX1064-NEXT: .LBB14_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB14_180 +; GFX1064-NEXT: .LBB14_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB14_183 +; GFX1064-NEXT: .LBB14_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_and_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB14_186 +; GFX1064-NEXT: .LBB14_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_186: +; 
GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_and_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB14_189 +; GFX1064-NEXT: .LBB14_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_and_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB14_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB14_192 +; GFX1064-NEXT: .LBB14_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB14_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB14_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB14_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_and_b32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB14_2: +; GFX1064-NEXT: .LBB14_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 
31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1032-NEXT: s_branch .LBB14_3 +; GFX1032-NEXT: .LBB14_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB14_6 +; GFX1032-NEXT: .LBB14_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB14_9 +; GFX1032-NEXT: .LBB14_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 
.LBB14_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB14_12 +; GFX1032-NEXT: .LBB14_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB14_15 +; GFX1032-NEXT: .LBB14_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB14_18 +; GFX1032-NEXT: .LBB14_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB14_21 +; GFX1032-NEXT: .LBB14_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 
s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB14_24 +; GFX1032-NEXT: .LBB14_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB14_27 +; GFX1032-NEXT: .LBB14_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB14_30 +; GFX1032-NEXT: .LBB14_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: 
s_cbranch_scc1 .LBB14_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB14_33 +; GFX1032-NEXT: .LBB14_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB14_36 +; GFX1032-NEXT: .LBB14_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB14_39 +; GFX1032-NEXT: .LBB14_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB14_42 +; GFX1032-NEXT: .LBB14_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_42: +; GFX1032-NEXT: s_and_b32 s4, s4, 
exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB14_45 +; GFX1032-NEXT: .LBB14_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB14_48 +; GFX1032-NEXT: .LBB14_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB14_51 +; GFX1032-NEXT: .LBB14_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 
+; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB14_54 +; GFX1032-NEXT: .LBB14_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB14_57 +; GFX1032-NEXT: .LBB14_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB14_60 +; GFX1032-NEXT: .LBB14_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB14_63 +; GFX1032-NEXT: .LBB14_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; 
GFX1032-NEXT: .LBB14_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB14_66 +; GFX1032-NEXT: .LBB14_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB14_69 +; GFX1032-NEXT: .LBB14_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB14_72 +; GFX1032-NEXT: .LBB14_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: 
s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB14_75 +; GFX1032-NEXT: .LBB14_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB14_78 +; GFX1032-NEXT: .LBB14_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB14_81 +; GFX1032-NEXT: .LBB14_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB14_84 +; 
GFX1032-NEXT: .LBB14_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB14_87 +; GFX1032-NEXT: .LBB14_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB14_90 +; GFX1032-NEXT: .LBB14_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB14_93 +; GFX1032-NEXT: .LBB14_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_and_b32 s2, 
s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB14_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB14_96 +; GFX1032-NEXT: .LBB14_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB14_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB14_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB14_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: s_and_b32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB14_2: +; GFX1032-NEXT: .LBB14_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; 
%bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] 
-; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1164-NEXT: s_branch .LBB14_3 +; GFX1164-NEXT: .LBB14_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB14_6 +; GFX1164-NEXT: .LBB14_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB14_9 +; GFX1164-NEXT: .LBB14_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_9: +; 
GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB14_12 +; GFX1164-NEXT: .LBB14_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB14_15 +; GFX1164-NEXT: .LBB14_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB14_18 +; GFX1164-NEXT: .LBB14_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; 
GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB14_21 +; GFX1164-NEXT: .LBB14_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB14_24 +; GFX1164-NEXT: .LBB14_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB14_27 +; GFX1164-NEXT: .LBB14_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_29 +; 
GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB14_30 +; GFX1164-NEXT: .LBB14_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB14_33 +; GFX1164-NEXT: .LBB14_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB14_36 +; GFX1164-NEXT: .LBB14_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB14_39 +; GFX1164-NEXT: .LBB14_38: +; GFX1164-NEXT: ; 
implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB14_42 +; GFX1164-NEXT: .LBB14_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB14_45 +; GFX1164-NEXT: .LBB14_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB14_48 +; GFX1164-NEXT: .LBB14_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; 
GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB14_51 +; GFX1164-NEXT: .LBB14_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB14_54 +; GFX1164-NEXT: .LBB14_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB14_57 +; GFX1164-NEXT: .LBB14_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB14_60 +; GFX1164-NEXT: .LBB14_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB14_63 +; GFX1164-NEXT: .LBB14_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB14_66 +; GFX1164-NEXT: .LBB14_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: 
v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB14_69 +; GFX1164-NEXT: .LBB14_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB14_72 +; GFX1164-NEXT: .LBB14_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB14_75 +; GFX1164-NEXT: .LBB14_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB14_78 +; GFX1164-NEXT: .LBB14_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_78: +; GFX1164-NEXT: s_and_b64 
s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB14_81 +; GFX1164-NEXT: .LBB14_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB14_84 +; GFX1164-NEXT: .LBB14_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB14_87 +; GFX1164-NEXT: .LBB14_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_and_b32 s6, s6, 
s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB14_90 +; GFX1164-NEXT: .LBB14_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1164-NEXT: s_and_b32 s4, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_mov_b32 s9, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB14_93 +; GFX1164-NEXT: .LBB14_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_and_b32 s4, s4, s2 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB14_96 +; GFX1164-NEXT: .LBB14_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_and_b32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 
.LBB14_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB14_99 +; GFX1164-NEXT: .LBB14_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB14_102 +; GFX1164-NEXT: .LBB14_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB14_105 +; GFX1164-NEXT: .LBB14_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB14_108 +; GFX1164-NEXT: .LBB14_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB14_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB14_111 +; GFX1164-NEXT: .LBB14_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB14_114 +; GFX1164-NEXT: .LBB14_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB14_117 +; GFX1164-NEXT: .LBB14_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 
+; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB14_120 +; GFX1164-NEXT: .LBB14_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB14_123 +; GFX1164-NEXT: .LBB14_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB14_126 +; GFX1164-NEXT: .LBB14_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB14_129 +; GFX1164-NEXT: .LBB14_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB14_132 +; GFX1164-NEXT: .LBB14_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB14_135 +; GFX1164-NEXT: .LBB14_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 
45 +; GFX1164-NEXT: s_branch .LBB14_138 +; GFX1164-NEXT: .LBB14_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB14_141 +; GFX1164-NEXT: .LBB14_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB14_144 +; GFX1164-NEXT: .LBB14_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB14_147 +; GFX1164-NEXT: .LBB14_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_147: +; 
GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB14_150 +; GFX1164-NEXT: .LBB14_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB14_153 +; GFX1164-NEXT: .LBB14_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB14_156 +; GFX1164-NEXT: .LBB14_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 
+; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB14_159 +; GFX1164-NEXT: .LBB14_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB14_162 +; GFX1164-NEXT: .LBB14_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB14_165 +; GFX1164-NEXT: .LBB14_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB14_168 +; GFX1164-NEXT: .LBB14_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB14_171 +; GFX1164-NEXT: .LBB14_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB14_174 +; GFX1164-NEXT: .LBB14_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: 
v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB14_177 +; GFX1164-NEXT: .LBB14_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB14_180 +; GFX1164-NEXT: .LBB14_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB14_183 +; GFX1164-NEXT: .LBB14_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_and_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB14_186 +; GFX1164-NEXT: .LBB14_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_186: +; 
GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_and_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB14_189 +; GFX1164-NEXT: .LBB14_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_and_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB14_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB14_192 +; GFX1164-NEXT: .LBB14_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB14_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB14_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB14_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_and_b32 s4, s6, s4 +; GFX1164-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1164-NEXT: ds_and_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB14_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB14_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3618,53 +19780,510 @@ ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: 
v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1132-NEXT: s_branch .LBB14_3 +; GFX1132-NEXT: .LBB14_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB14_6 +; GFX1132-NEXT: .LBB14_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; 
GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB14_9 +; GFX1132-NEXT: .LBB14_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB14_12 +; GFX1132-NEXT: .LBB14_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB14_15 +; GFX1132-NEXT: .LBB14_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB14_18 +; GFX1132-NEXT: .LBB14_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB14_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB14_21 +; GFX1132-NEXT: .LBB14_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB14_24 +; GFX1132-NEXT: .LBB14_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB14_27 +; GFX1132-NEXT: .LBB14_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; 
GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB14_30 +; GFX1132-NEXT: .LBB14_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB14_33 +; GFX1132-NEXT: .LBB14_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB14_36 +; GFX1132-NEXT: .LBB14_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB14_39 +; GFX1132-NEXT: .LBB14_38: +; GFX1132-NEXT: 
; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB14_42 +; GFX1132-NEXT: .LBB14_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB14_45 +; GFX1132-NEXT: .LBB14_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB14_48 +; GFX1132-NEXT: .LBB14_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 
+; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB14_51 +; GFX1132-NEXT: .LBB14_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB14_54 +; GFX1132-NEXT: .LBB14_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB14_57 +; GFX1132-NEXT: .LBB14_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch 
.LBB14_60 +; GFX1132-NEXT: .LBB14_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB14_63 +; GFX1132-NEXT: .LBB14_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB14_66 +; GFX1132-NEXT: .LBB14_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB14_69 +; GFX1132-NEXT: .LBB14_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: 
s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB14_72 +; GFX1132-NEXT: .LBB14_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB14_75 +; GFX1132-NEXT: .LBB14_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB14_78 +; GFX1132-NEXT: .LBB14_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_80 +; GFX1132-NEXT: ; %bb.79: +; 
GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB14_81 +; GFX1132-NEXT: .LBB14_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB14_84 +; GFX1132-NEXT: .LBB14_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB14_87 +; GFX1132-NEXT: .LBB14_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB14_90 +; GFX1132-NEXT: .LBB14_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 
-1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB14_93 +; GFX1132-NEXT: .LBB14_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB14_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB14_96 +; GFX1132-NEXT: .LBB14_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB14_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB14_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB14_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: s_and_b32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v4 +; GFX1132-NEXT: ds_and_rtn_b32 v0, v0, v2 
; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB14_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB14_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3694,269 +20313,4527 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 
row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB15_3 +; GFX8-NEXT: .LBB15_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB15_6 +; GFX8-NEXT: .LBB15_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB15_9 +; GFX8-NEXT: .LBB15_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB15_12 +; GFX8-NEXT: .LBB15_11: 
+; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB15_15 +; GFX8-NEXT: .LBB15_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB15_18 +; GFX8-NEXT: .LBB15_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB15_21 +; GFX8-NEXT: .LBB15_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; 
GFX8-NEXT: s_cbranch_scc1 .LBB15_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB15_24 +; GFX8-NEXT: .LBB15_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB15_27 +; GFX8-NEXT: .LBB15_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB15_30 +; GFX8-NEXT: .LBB15_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB15_33 +; GFX8-NEXT: .LBB15_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 
0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB15_36 +; GFX8-NEXT: .LBB15_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB15_39 +; GFX8-NEXT: .LBB15_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB15_42 +; GFX8-NEXT: .LBB15_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB15_45 +; GFX8-NEXT: .LBB15_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; 
GFX8-NEXT: .LBB15_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB15_48 +; GFX8-NEXT: .LBB15_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB15_51 +; GFX8-NEXT: .LBB15_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB15_54 +; GFX8-NEXT: .LBB15_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB15_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB15_57 +; GFX8-NEXT: .LBB15_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB15_60 +; GFX8-NEXT: .LBB15_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB15_63 +; GFX8-NEXT: .LBB15_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB15_66 +; GFX8-NEXT: .LBB15_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, 
exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB15_69 +; GFX8-NEXT: .LBB15_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB15_72 +; GFX8-NEXT: .LBB15_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB15_75 +; GFX8-NEXT: .LBB15_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB15_78 +; GFX8-NEXT: .LBB15_77: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB15_81 +; GFX8-NEXT: .LBB15_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB15_84 +; GFX8-NEXT: .LBB15_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB15_87 +; GFX8-NEXT: .LBB15_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB15_90 +; GFX8-NEXT: .LBB15_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_or_b32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB15_93 +; GFX8-NEXT: .LBB15_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_or_b32 s3, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s6, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s3, 31 +; GFX8-NEXT: s_branch .LBB15_96 +; GFX8-NEXT: .LBB15_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s6, 0 +; GFX8-NEXT: s_or_b32 s6, s3, s4 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB15_99 +; GFX8-NEXT: .LBB15_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 
s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB15_102 +; GFX8-NEXT: .LBB15_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB15_105 +; GFX8-NEXT: .LBB15_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB15_108 +; GFX8-NEXT: .LBB15_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB15_111 +; GFX8-NEXT: 
.LBB15_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB15_114 +; GFX8-NEXT: .LBB15_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB15_117 +; GFX8-NEXT: .LBB15_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB15_120 +; GFX8-NEXT: .LBB15_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB15_123 +; GFX8-NEXT: .LBB15_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB15_126 +; GFX8-NEXT: .LBB15_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB15_129 +; GFX8-NEXT: .LBB15_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB15_132 +; GFX8-NEXT: .LBB15_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB15_135 +; GFX8-NEXT: .LBB15_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB15_138 +; GFX8-NEXT: .LBB15_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB15_141 +; GFX8-NEXT: .LBB15_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; 
GFX8-NEXT: s_branch .LBB15_144 +; GFX8-NEXT: .LBB15_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB15_147 +; GFX8-NEXT: .LBB15_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB15_150 +; GFX8-NEXT: .LBB15_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB15_153 +; GFX8-NEXT: .LBB15_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB15_156 +; GFX8-NEXT: .LBB15_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB15_159 +; GFX8-NEXT: .LBB15_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB15_162 +; GFX8-NEXT: .LBB15_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB15_165 +; GFX8-NEXT: .LBB15_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_165: +; GFX8-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB15_168 +; GFX8-NEXT: .LBB15_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB15_171 +; GFX8-NEXT: .LBB15_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB15_174 +; GFX8-NEXT: .LBB15_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_176 +; 
GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB15_177 +; GFX8-NEXT: .LBB15_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB15_180 +; GFX8-NEXT: .LBB15_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB15_183 +; GFX8-NEXT: .LBB15_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB15_186 +; GFX8-NEXT: .LBB15_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 
2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB15_189 +; GFX8-NEXT: .LBB15_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_or_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB15_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB15_192 +; GFX8-NEXT: .LBB15_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB15_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB15_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_cbranch_execz .LBB15_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_or_b32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB15_2: +; GFX8-NEXT: .LBB15_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: 
s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB15_3 +; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; 
GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB15_6 +; GFX9-NEXT: .LBB15_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB15_9 +; GFX9-NEXT: .LBB15_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB15_12 +; GFX9-NEXT: .LBB15_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB15_15 +; GFX9-NEXT: .LBB15_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_15: +; GFX9-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB15_18 +; GFX9-NEXT: .LBB15_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB15_21 +; GFX9-NEXT: .LBB15_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB15_24 +; GFX9-NEXT: .LBB15_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 
v1, s6, 8 +; GFX9-NEXT: s_branch .LBB15_27 +; GFX9-NEXT: .LBB15_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB15_30 +; GFX9-NEXT: .LBB15_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB15_33 +; GFX9-NEXT: .LBB15_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB15_36 +; GFX9-NEXT: .LBB15_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 
+; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB15_39 +; GFX9-NEXT: .LBB15_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB15_42 +; GFX9-NEXT: .LBB15_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB15_45 +; GFX9-NEXT: .LBB15_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB15_48 +; GFX9-NEXT: .LBB15_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: 
s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB15_51 +; GFX9-NEXT: .LBB15_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB15_54 +; GFX9-NEXT: .LBB15_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB15_57 +; GFX9-NEXT: .LBB15_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 
19 +; GFX9-NEXT: s_branch .LBB15_60 +; GFX9-NEXT: .LBB15_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB15_63 +; GFX9-NEXT: .LBB15_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB15_66 +; GFX9-NEXT: .LBB15_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB15_69 +; GFX9-NEXT: .LBB15_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB15_72 +; GFX9-NEXT: .LBB15_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB15_75 +; GFX9-NEXT: .LBB15_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB15_78 +; GFX9-NEXT: .LBB15_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB15_81 +; GFX9-NEXT: .LBB15_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB15_84 +; GFX9-NEXT: .LBB15_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB15_87 +; GFX9-NEXT: .LBB15_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB15_90 +; GFX9-NEXT: .LBB15_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_or_b32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_92 +; GFX9-NEXT: ; 
%bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB15_93 +; GFX9-NEXT: .LBB15_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_or_b32 s3, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s6, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s3, 31 +; GFX9-NEXT: s_branch .LBB15_96 +; GFX9-NEXT: .LBB15_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: s_or_b32 s6, s3, s4 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB15_99 +; GFX9-NEXT: .LBB15_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB15_102 +; GFX9-NEXT: .LBB15_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB15_105 +; GFX9-NEXT: .LBB15_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB15_108 +; GFX9-NEXT: .LBB15_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB15_111 +; GFX9-NEXT: .LBB15_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB15_114 +; GFX9-NEXT: .LBB15_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_114: +; GFX9-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB15_117 +; GFX9-NEXT: .LBB15_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB15_120 +; GFX9-NEXT: .LBB15_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB15_123 +; GFX9-NEXT: .LBB15_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_125 +; GFX9-NEXT: ; %bb.124: +; 
GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB15_126 +; GFX9-NEXT: .LBB15_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB15_129 +; GFX9-NEXT: .LBB15_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB15_132 +; GFX9-NEXT: .LBB15_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB15_135 +; GFX9-NEXT: .LBB15_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 
exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB15_138 +; GFX9-NEXT: .LBB15_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB15_141 +; GFX9-NEXT: .LBB15_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB15_144 +; GFX9-NEXT: .LBB15_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB15_147 +; GFX9-NEXT: .LBB15_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB15_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB15_150 +; GFX9-NEXT: .LBB15_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB15_153 +; GFX9-NEXT: .LBB15_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB15_156 +; GFX9-NEXT: .LBB15_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: 
s_cbranch_scc1 .LBB15_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB15_159 +; GFX9-NEXT: .LBB15_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB15_162 +; GFX9-NEXT: .LBB15_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB15_165 +; GFX9-NEXT: .LBB15_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB15_168 +; GFX9-NEXT: .LBB15_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: 
s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB15_171 +; GFX9-NEXT: .LBB15_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB15_174 +; GFX9-NEXT: .LBB15_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB15_177 +; GFX9-NEXT: .LBB15_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB15_180 +; 
GFX9-NEXT: .LBB15_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB15_183 +; GFX9-NEXT: .LBB15_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB15_186 +; GFX9-NEXT: .LBB15_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB15_189 +; GFX9-NEXT: .LBB15_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_or_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; 
GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB15_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB15_192 +; GFX9-NEXT: .LBB15_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB15_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_cbranch_execz .LBB15_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_or_b32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_or_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB15_2: +; GFX9-NEXT: .LBB15_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 
-; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1064-NEXT: s_branch .LBB15_3 +; GFX1064-NEXT: .LBB15_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB15_6 +; GFX1064-NEXT: .LBB15_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB15_9 +; GFX1064-NEXT: .LBB15_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB15_12 +; GFX1064-NEXT: .LBB15_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_14 +; GFX1064-NEXT: ; %bb.13: +; 
GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB15_15 +; GFX1064-NEXT: .LBB15_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB15_18 +; GFX1064-NEXT: .LBB15_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB15_21 +; GFX1064-NEXT: .LBB15_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB15_24 +; GFX1064-NEXT: .LBB15_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], 
exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB15_27 +; GFX1064-NEXT: .LBB15_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB15_30 +; GFX1064-NEXT: .LBB15_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB15_33 +; GFX1064-NEXT: .LBB15_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB15_36 +; GFX1064-NEXT: .LBB15_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB15_39 +; GFX1064-NEXT: .LBB15_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB15_42 +; GFX1064-NEXT: .LBB15_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_44 
+; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB15_45 +; GFX1064-NEXT: .LBB15_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB15_48 +; GFX1064-NEXT: .LBB15_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB15_51 +; GFX1064-NEXT: .LBB15_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB15_54 +; GFX1064-NEXT: .LBB15_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: 
.LBB15_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB15_57 +; GFX1064-NEXT: .LBB15_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB15_60 +; GFX1064-NEXT: .LBB15_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB15_63 +; GFX1064-NEXT: .LBB15_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; 
GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB15_66 +; GFX1064-NEXT: .LBB15_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB15_69 +; GFX1064-NEXT: .LBB15_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB15_72 +; GFX1064-NEXT: .LBB15_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB15_75 +; GFX1064-NEXT: .LBB15_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB15_78 +; GFX1064-NEXT: .LBB15_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB15_81 +; GFX1064-NEXT: .LBB15_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch 
.LBB15_84 +; GFX1064-NEXT: .LBB15_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_or_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB15_87 +; GFX1064-NEXT: .LBB15_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB15_90 +; GFX1064-NEXT: .LBB15_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_or_b32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB15_93 +; GFX1064-NEXT: .LBB15_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; 
GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_or_b32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB15_96 +; GFX1064-NEXT: .LBB15_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s5, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_or_b32 s6, s4, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB15_99 +; GFX1064-NEXT: .LBB15_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB15_102 +; GFX1064-NEXT: .LBB15_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB15_105 +; GFX1064-NEXT: .LBB15_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB15_108 +; GFX1064-NEXT: .LBB15_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB15_111 +; GFX1064-NEXT: .LBB15_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: 
s_branch .LBB15_114 +; GFX1064-NEXT: .LBB15_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB15_117 +; GFX1064-NEXT: .LBB15_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB15_120 +; GFX1064-NEXT: .LBB15_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB15_123 +; GFX1064-NEXT: .LBB15_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB15_126 +; GFX1064-NEXT: .LBB15_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB15_129 +; GFX1064-NEXT: .LBB15_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB15_132 +; GFX1064-NEXT: .LBB15_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; 
GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB15_135 +; GFX1064-NEXT: .LBB15_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB15_138 +; GFX1064-NEXT: .LBB15_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB15_141 +; GFX1064-NEXT: .LBB15_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB15_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB15_144 +; GFX1064-NEXT: .LBB15_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB15_147 +; GFX1064-NEXT: .LBB15_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB15_150 +; GFX1064-NEXT: .LBB15_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB15_153 +; 
GFX1064-NEXT: .LBB15_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB15_156 +; GFX1064-NEXT: .LBB15_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB15_159 +; GFX1064-NEXT: .LBB15_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB15_162 +; GFX1064-NEXT: .LBB15_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, 
s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB15_165 +; GFX1064-NEXT: .LBB15_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB15_168 +; GFX1064-NEXT: .LBB15_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB15_171 +; GFX1064-NEXT: .LBB15_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB15_174 +; GFX1064-NEXT: .LBB15_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB15_177 +; GFX1064-NEXT: .LBB15_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB15_180 +; GFX1064-NEXT: .LBB15_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB15_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB15_183 +; GFX1064-NEXT: .LBB15_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_or_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB15_186 +; GFX1064-NEXT: .LBB15_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB15_189 +; GFX1064-NEXT: .LBB15_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_or_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB15_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: 
v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB15_192 +; GFX1064-NEXT: .LBB15_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB15_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB15_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB15_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_or_b32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB15_2: +; GFX1064-NEXT: .LBB15_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_2 +; 
GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB15_3 +; GFX1032-NEXT: .LBB15_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB15_6 +; GFX1032-NEXT: .LBB15_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_6: +; 
GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB15_9 +; GFX1032-NEXT: .LBB15_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB15_12 +; GFX1032-NEXT: .LBB15_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB15_15 +; GFX1032-NEXT: .LBB15_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB15_18 +; GFX1032-NEXT: .LBB15_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB15_21 +; GFX1032-NEXT: .LBB15_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB15_24 +; GFX1032-NEXT: .LBB15_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB15_27 +; GFX1032-NEXT: .LBB15_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_27: +; 
GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB15_30 +; GFX1032-NEXT: .LBB15_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB15_33 +; GFX1032-NEXT: .LBB15_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB15_36 +; GFX1032-NEXT: .LBB15_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB15_39 +; GFX1032-NEXT: .LBB15_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB15_42 +; GFX1032-NEXT: .LBB15_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB15_45 +; GFX1032-NEXT: .LBB15_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB15_48 +; GFX1032-NEXT: .LBB15_47: +; GFX1032-NEXT: ; implicit-def: 
$vgpr1 +; GFX1032-NEXT: .LBB15_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB15_51 +; GFX1032-NEXT: .LBB15_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB15_54 +; GFX1032-NEXT: .LBB15_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB15_57 +; GFX1032-NEXT: .LBB15_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: 
s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB15_60 +; GFX1032-NEXT: .LBB15_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB15_63 +; GFX1032-NEXT: .LBB15_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB15_66 +; GFX1032-NEXT: .LBB15_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB15_69 +; 
GFX1032-NEXT: .LBB15_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB15_72 +; GFX1032-NEXT: .LBB15_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB15_75 +; GFX1032-NEXT: .LBB15_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB15_78 +; GFX1032-NEXT: .LBB15_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 
+; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB15_81 +; GFX1032-NEXT: .LBB15_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB15_84 +; GFX1032-NEXT: .LBB15_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB15_87 +; GFX1032-NEXT: .LBB15_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: 
v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB15_90 +; GFX1032-NEXT: .LBB15_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB15_93 +; GFX1032-NEXT: .LBB15_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB15_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB15_96 +; GFX1032-NEXT: .LBB15_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB15_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB15_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB15_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_or_b32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB15_2: +; GFX1032-NEXT: .LBB15_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB15_3 +; GFX1164-NEXT: .LBB15_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB15_6 +; GFX1164-NEXT: .LBB15_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB15_9 +; GFX1164-NEXT: .LBB15_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB15_12 +; GFX1164-NEXT: .LBB15_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; 
GFX1164-NEXT: s_branch .LBB15_15 +; GFX1164-NEXT: .LBB15_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB15_18 +; GFX1164-NEXT: .LBB15_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB15_21 +; GFX1164-NEXT: .LBB15_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB15_24 +; GFX1164-NEXT: .LBB15_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 
+; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB15_27 +; GFX1164-NEXT: .LBB15_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB15_30 +; GFX1164-NEXT: .LBB15_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB15_33 +; GFX1164-NEXT: .LBB15_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB15_36 +; GFX1164-NEXT: .LBB15_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB15_39 +; GFX1164-NEXT: .LBB15_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB15_42 +; GFX1164-NEXT: .LBB15_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_44 +; GFX1164-NEXT: ; %bb.43: +; 
GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB15_45 +; GFX1164-NEXT: .LBB15_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB15_48 +; GFX1164-NEXT: .LBB15_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB15_51 +; GFX1164-NEXT: .LBB15_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB15_54 +; GFX1164-NEXT: .LBB15_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_54: +; GFX1164-NEXT: 
s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB15_57 +; GFX1164-NEXT: .LBB15_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB15_60 +; GFX1164-NEXT: .LBB15_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB15_63 +; GFX1164-NEXT: .LBB15_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; 
GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB15_66 +; GFX1164-NEXT: .LBB15_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB15_69 +; GFX1164-NEXT: .LBB15_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB15_72 +; GFX1164-NEXT: .LBB15_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB15_75 +; GFX1164-NEXT: .LBB15_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB15_78 +; GFX1164-NEXT: .LBB15_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB15_81 +; GFX1164-NEXT: .LBB15_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB15_84 +; GFX1164-NEXT: .LBB15_83: +; 
GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_or_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB15_87 +; GFX1164-NEXT: .LBB15_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB15_90 +; GFX1164-NEXT: .LBB15_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_or_b32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB15_93 +; GFX1164-NEXT: .LBB15_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 +; 
GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_or_b32 s4, s4, s2 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB15_96 +; GFX1164-NEXT: .LBB15_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s5, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_or_b32 s6, s4, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB15_99 +; GFX1164-NEXT: .LBB15_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB15_102 +; GFX1164-NEXT: .LBB15_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB15_105 +; GFX1164-NEXT: .LBB15_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB15_108 +; GFX1164-NEXT: .LBB15_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB15_111 +; GFX1164-NEXT: .LBB15_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB15_114 +; 
GFX1164-NEXT: .LBB15_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB15_117 +; GFX1164-NEXT: .LBB15_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB15_120 +; GFX1164-NEXT: .LBB15_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB15_123 +; GFX1164-NEXT: .LBB15_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB15_126 +; GFX1164-NEXT: .LBB15_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB15_129 +; GFX1164-NEXT: .LBB15_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB15_132 +; GFX1164-NEXT: .LBB15_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB15_135 +; GFX1164-NEXT: .LBB15_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB15_138 +; GFX1164-NEXT: .LBB15_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB15_141 +; GFX1164-NEXT: .LBB15_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_143 +; GFX1164-NEXT: 
; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB15_144 +; GFX1164-NEXT: .LBB15_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB15_147 +; GFX1164-NEXT: .LBB15_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB15_150 +; GFX1164-NEXT: .LBB15_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB15_153 +; GFX1164-NEXT: .LBB15_152: +; GFX1164-NEXT: ; 
implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB15_156 +; GFX1164-NEXT: .LBB15_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB15_159 +; GFX1164-NEXT: .LBB15_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB15_162 +; GFX1164-NEXT: .LBB15_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 
54 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB15_165 +; GFX1164-NEXT: .LBB15_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB15_168 +; GFX1164-NEXT: .LBB15_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB15_171 +; GFX1164-NEXT: .LBB15_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB15_174 +; GFX1164-NEXT: .LBB15_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB15_177 +; GFX1164-NEXT: .LBB15_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB15_180 +; GFX1164-NEXT: .LBB15_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_182 +; GFX1164-NEXT: 
; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB15_183 +; GFX1164-NEXT: .LBB15_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_or_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB15_186 +; GFX1164-NEXT: .LBB15_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB15_189 +; GFX1164-NEXT: .LBB15_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_or_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB15_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: 
s_branch .LBB15_192 +; GFX1164-NEXT: .LBB15_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB15_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB15_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB15_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_or_b32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB15_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB15_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3964,53 +24841,510 @@ ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; 
GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB15_3 +; GFX1132-NEXT: .LBB15_2: +; GFX1132-NEXT: ; 
implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB15_6 +; GFX1132-NEXT: .LBB15_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB15_9 +; GFX1132-NEXT: .LBB15_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB15_12 +; GFX1132-NEXT: .LBB15_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB15_15 +; GFX1132-NEXT: .LBB15_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB15_18 +; GFX1132-NEXT: .LBB15_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB15_21 +; GFX1132-NEXT: .LBB15_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB15_24 +; GFX1132-NEXT: .LBB15_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_24: +; 
GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB15_27 +; GFX1132-NEXT: .LBB15_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB15_30 +; GFX1132-NEXT: .LBB15_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB15_33 +; GFX1132-NEXT: .LBB15_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 
s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB15_36 +; GFX1132-NEXT: .LBB15_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB15_39 +; GFX1132-NEXT: .LBB15_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB15_42 +; GFX1132-NEXT: .LBB15_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB15_45 +; GFX1132-NEXT: .LBB15_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB15_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB15_48 +; GFX1132-NEXT: .LBB15_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB15_51 +; GFX1132-NEXT: .LBB15_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB15_54 +; GFX1132-NEXT: .LBB15_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 
exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB15_57 +; GFX1132-NEXT: .LBB15_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB15_60 +; GFX1132-NEXT: .LBB15_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB15_63 +; GFX1132-NEXT: .LBB15_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB15_66 +; GFX1132-NEXT: .LBB15_65: 
+; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB15_69 +; GFX1132-NEXT: .LBB15_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB15_72 +; GFX1132-NEXT: .LBB15_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB15_75 +; GFX1132-NEXT: .LBB15_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: 
v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB15_78 +; GFX1132-NEXT: .LBB15_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB15_81 +; GFX1132-NEXT: .LBB15_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB15_84 +; GFX1132-NEXT: .LBB15_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; 
GFX1132-NEXT: s_branch .LBB15_87 +; GFX1132-NEXT: .LBB15_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB15_90 +; GFX1132-NEXT: .LBB15_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB15_93 +; GFX1132-NEXT: .LBB15_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB15_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB15_96 +; GFX1132-NEXT: .LBB15_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB15_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; 
GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB15_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB15_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_or_b32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_or_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_or_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB15_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB15_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4040,269 +25374,4527 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: 
v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB16_3 +; GFX8-NEXT: .LBB16_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB16_6 +; GFX8-NEXT: .LBB16_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB16_9 +; GFX8-NEXT: .LBB16_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB16_12 +; GFX8-NEXT: .LBB16_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB16_15 +; GFX8-NEXT: .LBB16_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB16_18 +; GFX8-NEXT: .LBB16_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB16_21 +; GFX8-NEXT: .LBB16_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB16_24 +; GFX8-NEXT: .LBB16_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB16_27 +; GFX8-NEXT: .LBB16_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; 
GFX8-NEXT: s_branch .LBB16_30 +; GFX8-NEXT: .LBB16_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB16_33 +; GFX8-NEXT: .LBB16_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB16_36 +; GFX8-NEXT: .LBB16_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB16_39 +; GFX8-NEXT: .LBB16_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB16_42 +; GFX8-NEXT: .LBB16_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB16_45 +; GFX8-NEXT: .LBB16_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB16_48 +; GFX8-NEXT: .LBB16_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB16_51 +; GFX8-NEXT: .LBB16_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB16_54 +; GFX8-NEXT: .LBB16_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB16_57 +; GFX8-NEXT: .LBB16_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB16_60 +; GFX8-NEXT: .LBB16_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, 
s6, 20 +; GFX8-NEXT: s_branch .LBB16_63 +; GFX8-NEXT: .LBB16_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB16_66 +; GFX8-NEXT: .LBB16_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB16_69 +; GFX8-NEXT: .LBB16_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB16_72 +; GFX8-NEXT: .LBB16_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB16_75 +; GFX8-NEXT: .LBB16_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB16_78 +; GFX8-NEXT: .LBB16_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB16_81 +; GFX8-NEXT: .LBB16_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB16_84 +; GFX8-NEXT: .LBB16_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_84: +; GFX8-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB16_87 +; GFX8-NEXT: .LBB16_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB16_90 +; GFX8-NEXT: .LBB16_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_xor_b32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB16_93 +; GFX8-NEXT: .LBB16_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_xor_b32 s3, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s6, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_95 +; GFX8-NEXT: ; %bb.94: +; 
GFX8-NEXT: v_writelane_b32 v1, s3, 31 +; GFX8-NEXT: s_branch .LBB16_96 +; GFX8-NEXT: .LBB16_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s6, 0 +; GFX8-NEXT: s_xor_b32 s6, s3, s4 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB16_99 +; GFX8-NEXT: .LBB16_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB16_102 +; GFX8-NEXT: .LBB16_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB16_105 +; GFX8-NEXT: .LBB16_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 
exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB16_108 +; GFX8-NEXT: .LBB16_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB16_111 +; GFX8-NEXT: .LBB16_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB16_114 +; GFX8-NEXT: .LBB16_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB16_117 +; GFX8-NEXT: .LBB16_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_117: +; 
GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB16_120 +; GFX8-NEXT: .LBB16_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB16_123 +; GFX8-NEXT: .LBB16_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB16_126 +; GFX8-NEXT: .LBB16_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_128 
+; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB16_129 +; GFX8-NEXT: .LBB16_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB16_132 +; GFX8-NEXT: .LBB16_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB16_135 +; GFX8-NEXT: .LBB16_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB16_138 +; GFX8-NEXT: .LBB16_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 
+; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB16_141 +; GFX8-NEXT: .LBB16_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB16_144 +; GFX8-NEXT: .LBB16_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB16_147 +; GFX8-NEXT: .LBB16_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB16_150 +; GFX8-NEXT: .LBB16_149: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB16_153 +; GFX8-NEXT: .LBB16_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB16_156 +; GFX8-NEXT: .LBB16_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB16_159 +; GFX8-NEXT: .LBB16_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB16_162 +; GFX8-NEXT: .LBB16_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB16_165 +; GFX8-NEXT: .LBB16_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB16_168 +; GFX8-NEXT: .LBB16_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB16_171 +; GFX8-NEXT: .LBB16_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB16_174 +; GFX8-NEXT: .LBB16_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB16_177 +; GFX8-NEXT: .LBB16_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB16_180 +; GFX8-NEXT: .LBB16_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, 
s6, 60 +; GFX8-NEXT: s_branch .LBB16_183 +; GFX8-NEXT: .LBB16_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB16_186 +; GFX8-NEXT: .LBB16_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB16_189 +; GFX8-NEXT: .LBB16_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_xor_b32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB16_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB16_192 +; GFX8-NEXT: .LBB16_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB16_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: 
$vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB16_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_cbranch_execz .LBB16_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_xor_b32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB16_2: +; GFX8-NEXT: .LBB16_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 
row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB16_3 +; GFX9-NEXT: .LBB16_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB16_6 +; GFX9-NEXT: .LBB16_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB16_9 +; GFX9-NEXT: .LBB16_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 
+; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB16_12 +; GFX9-NEXT: .LBB16_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB16_15 +; GFX9-NEXT: .LBB16_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB16_18 +; GFX9-NEXT: .LBB16_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB16_21 +; GFX9-NEXT: .LBB16_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB16_24 +; GFX9-NEXT: .LBB16_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB16_27 +; GFX9-NEXT: .LBB16_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB16_30 +; GFX9-NEXT: .LBB16_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB16_33 +; 
GFX9-NEXT: .LBB16_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB16_36 +; GFX9-NEXT: .LBB16_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB16_39 +; GFX9-NEXT: .LBB16_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB16_42 +; GFX9-NEXT: .LBB16_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 
+; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB16_45 +; GFX9-NEXT: .LBB16_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB16_48 +; GFX9-NEXT: .LBB16_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB16_51 +; GFX9-NEXT: .LBB16_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB16_54 +; GFX9-NEXT: .LBB16_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: 
s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB16_57 +; GFX9-NEXT: .LBB16_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB16_60 +; GFX9-NEXT: .LBB16_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB16_63 +; GFX9-NEXT: .LBB16_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch 
.LBB16_66 +; GFX9-NEXT: .LBB16_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB16_69 +; GFX9-NEXT: .LBB16_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB16_72 +; GFX9-NEXT: .LBB16_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB16_75 +; GFX9-NEXT: .LBB16_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB16_78 +; GFX9-NEXT: .LBB16_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB16_81 +; GFX9-NEXT: .LBB16_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB16_84 +; GFX9-NEXT: .LBB16_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB16_87 +; GFX9-NEXT: .LBB16_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: 
s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB16_90 +; GFX9-NEXT: .LBB16_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_xor_b32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB16_93 +; GFX9-NEXT: .LBB16_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_xor_b32 s3, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s6, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s3, 31 +; GFX9-NEXT: s_branch .LBB16_96 +; GFX9-NEXT: .LBB16_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: s_xor_b32 s6, s3, s4 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB16_99 +; GFX9-NEXT: .LBB16_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB16_102 +; GFX9-NEXT: .LBB16_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB16_105 +; GFX9-NEXT: .LBB16_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB16_108 +; GFX9-NEXT: .LBB16_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB16_111 +; GFX9-NEXT: .LBB16_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB16_114 +; GFX9-NEXT: .LBB16_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB16_117 +; GFX9-NEXT: .LBB16_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB16_120 +; GFX9-NEXT: .LBB16_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_120: +; GFX9-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB16_123 +; GFX9-NEXT: .LBB16_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB16_126 +; GFX9-NEXT: .LBB16_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB16_129 +; GFX9-NEXT: .LBB16_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_131 +; GFX9-NEXT: ; 
%bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB16_132 +; GFX9-NEXT: .LBB16_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB16_135 +; GFX9-NEXT: .LBB16_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB16_138 +; GFX9-NEXT: .LBB16_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB16_141 +; GFX9-NEXT: .LBB16_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB16_144 +; GFX9-NEXT: .LBB16_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB16_147 +; GFX9-NEXT: .LBB16_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB16_150 +; GFX9-NEXT: .LBB16_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB16_153 +; GFX9-NEXT: .LBB16_152: +; GFX9-NEXT: ; implicit-def: 
$vgpr1 +; GFX9-NEXT: .LBB16_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB16_156 +; GFX9-NEXT: .LBB16_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB16_159 +; GFX9-NEXT: .LBB16_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB16_162 +; GFX9-NEXT: .LBB16_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 
s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB16_165 +; GFX9-NEXT: .LBB16_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB16_168 +; GFX9-NEXT: .LBB16_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB16_171 +; GFX9-NEXT: .LBB16_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB16_174 +; GFX9-NEXT: .LBB16_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: 
s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB16_177 +; GFX9-NEXT: .LBB16_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB16_180 +; GFX9-NEXT: .LBB16_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB16_183 +; GFX9-NEXT: .LBB16_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; 
GFX9-NEXT: s_branch .LBB16_186 +; GFX9-NEXT: .LBB16_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB16_189 +; GFX9-NEXT: .LBB16_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_xor_b32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB16_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB16_192 +; GFX9-NEXT: .LBB16_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB16_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB16_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_cbranch_execz .LBB16_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_xor_b32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_xor_rtn_b32 v0, v3, v0 +; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
.LBB16_2: +; GFX9-NEXT: .LBB16_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: 
s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1064-NEXT: s_branch .LBB16_3 +; GFX1064-NEXT: .LBB16_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB16_6 +; GFX1064-NEXT: .LBB16_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB16_9 +; GFX1064-NEXT: .LBB16_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; 
GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB16_12 +; GFX1064-NEXT: .LBB16_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB16_15 +; GFX1064-NEXT: .LBB16_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB16_18 +; GFX1064-NEXT: .LBB16_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB16_21 +; GFX1064-NEXT: .LBB16_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB16_24 +; GFX1064-NEXT: .LBB16_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB16_27 +; GFX1064-NEXT: .LBB16_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch 
.LBB16_30 +; GFX1064-NEXT: .LBB16_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB16_33 +; GFX1064-NEXT: .LBB16_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB16_36 +; GFX1064-NEXT: .LBB16_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB16_39 +; GFX1064-NEXT: .LBB16_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB16_42 +; GFX1064-NEXT: .LBB16_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB16_45 +; GFX1064-NEXT: .LBB16_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB16_48 +; GFX1064-NEXT: .LBB16_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 
s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB16_51 +; GFX1064-NEXT: .LBB16_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB16_54 +; GFX1064-NEXT: .LBB16_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB16_57 +; GFX1064-NEXT: .LBB16_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_59 +; GFX1064-NEXT: 
; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB16_60 +; GFX1064-NEXT: .LBB16_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB16_63 +; GFX1064-NEXT: .LBB16_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB16_66 +; GFX1064-NEXT: .LBB16_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB16_69 +; GFX1064-NEXT: .LBB16_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 
+; GFX1064-NEXT: .LBB16_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB16_72 +; GFX1064-NEXT: .LBB16_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB16_75 +; GFX1064-NEXT: .LBB16_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB16_78 +; GFX1064-NEXT: .LBB16_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_xor_b32 
s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB16_81 +; GFX1064-NEXT: .LBB16_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB16_84 +; GFX1064-NEXT: .LBB16_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_xor_b32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB16_87 +; GFX1064-NEXT: .LBB16_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB16_90 +; GFX1064-NEXT: .LBB16_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_xor_b32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB16_93 +; GFX1064-NEXT: .LBB16_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_xor_b32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB16_96 +; GFX1064-NEXT: .LBB16_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s5, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_xor_b32 s6, s4, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB16_99 +; GFX1064-NEXT: .LBB16_98: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB16_102 +; GFX1064-NEXT: .LBB16_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB16_105 +; GFX1064-NEXT: .LBB16_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB16_108 +; GFX1064-NEXT: .LBB16_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; 
GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB16_111 +; GFX1064-NEXT: .LBB16_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB16_114 +; GFX1064-NEXT: .LBB16_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB16_117 +; GFX1064-NEXT: .LBB16_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB16_120 +; GFX1064-NEXT: .LBB16_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB16_123 +; GFX1064-NEXT: .LBB16_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB16_126 +; GFX1064-NEXT: .LBB16_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, 
s6, 42 +; GFX1064-NEXT: s_branch .LBB16_129 +; GFX1064-NEXT: .LBB16_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB16_132 +; GFX1064-NEXT: .LBB16_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB16_135 +; GFX1064-NEXT: .LBB16_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB16_138 +; GFX1064-NEXT: .LBB16_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_138: +; GFX1064-NEXT: s_and_b64 s[2:3], 
s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB16_141 +; GFX1064-NEXT: .LBB16_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB16_144 +; GFX1064-NEXT: .LBB16_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB16_147 +; GFX1064-NEXT: .LBB16_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; 
GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB16_150 +; GFX1064-NEXT: .LBB16_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB16_153 +; GFX1064-NEXT: .LBB16_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB16_156 +; GFX1064-NEXT: .LBB16_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB16_159 +; GFX1064-NEXT: .LBB16_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB16_162 +; GFX1064-NEXT: .LBB16_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB16_165 +; GFX1064-NEXT: .LBB16_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB16_168 
+; GFX1064-NEXT: .LBB16_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB16_171 +; GFX1064-NEXT: .LBB16_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB16_174 +; GFX1064-NEXT: .LBB16_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB16_177 +; GFX1064-NEXT: .LBB16_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_177: +; GFX1064-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB16_180 +; GFX1064-NEXT: .LBB16_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB16_183 +; GFX1064-NEXT: .LBB16_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_xor_b32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB16_186 +; GFX1064-NEXT: .LBB16_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 
s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB16_189 +; GFX1064-NEXT: .LBB16_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_xor_b32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB16_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB16_192 +; GFX1064-NEXT: .LBB16_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB16_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB16_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB16_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_xor_b32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: 
buffer_gl0_inv -; GFX1064-NEXT: .LBB16_2: +; GFX1064-NEXT: .LBB16_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB16_3 +; GFX1032-NEXT: .LBB16_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB16_6 +; GFX1032-NEXT: .LBB16_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB16_9 +; GFX1032-NEXT: .LBB16_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: 
s_branch .LBB16_12 +; GFX1032-NEXT: .LBB16_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB16_15 +; GFX1032-NEXT: .LBB16_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB16_18 +; GFX1032-NEXT: .LBB16_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB16_21 +; GFX1032-NEXT: .LBB16_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; 
GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB16_24 +; GFX1032-NEXT: .LBB16_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB16_27 +; GFX1032-NEXT: .LBB16_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB16_30 +; GFX1032-NEXT: .LBB16_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; 
GFX1032-NEXT: s_branch .LBB16_33 +; GFX1032-NEXT: .LBB16_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB16_36 +; GFX1032-NEXT: .LBB16_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB16_39 +; GFX1032-NEXT: .LBB16_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB16_42 +; GFX1032-NEXT: .LBB16_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; 
GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB16_45 +; GFX1032-NEXT: .LBB16_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB16_48 +; GFX1032-NEXT: .LBB16_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB16_51 +; GFX1032-NEXT: .LBB16_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_53 +; GFX1032-NEXT: ; %bb.52: 
+; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB16_54 +; GFX1032-NEXT: .LBB16_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB16_57 +; GFX1032-NEXT: .LBB16_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB16_60 +; GFX1032-NEXT: .LBB16_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB16_63 +; GFX1032-NEXT: .LBB16_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB16_66 +; GFX1032-NEXT: .LBB16_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB16_69 +; GFX1032-NEXT: .LBB16_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB16_72 +; GFX1032-NEXT: .LBB16_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX1032-NEXT: s_cbranch_scc1 .LBB16_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB16_75 +; GFX1032-NEXT: .LBB16_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB16_78 +; GFX1032-NEXT: .LBB16_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB16_81 +; GFX1032-NEXT: .LBB16_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB16_84 +; GFX1032-NEXT: .LBB16_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_84: +; GFX1032-NEXT: 
s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB16_87 +; GFX1032-NEXT: .LBB16_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB16_90 +; GFX1032-NEXT: .LBB16_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB16_93 +; GFX1032-NEXT: .LBB16_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; 
GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB16_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB16_96 +; GFX1032-NEXT: .LBB16_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB16_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB16_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB16_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_xor_b32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB16_2: +; GFX1032-NEXT: .LBB16_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -; -; GFX1164-LABEL: xor_i32_varying: -; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; 
GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB16_2 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: xor_i32_varying: +; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB16_3 +; GFX1164-NEXT: .LBB16_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB16_6 +; GFX1164-NEXT: .LBB16_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: 
s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB16_9 +; GFX1164-NEXT: .LBB16_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB16_12 +; GFX1164-NEXT: .LBB16_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB16_15 +; GFX1164-NEXT: .LBB16_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 
.LBB16_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB16_18 +; GFX1164-NEXT: .LBB16_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB16_21 +; GFX1164-NEXT: .LBB16_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB16_24 +; GFX1164-NEXT: .LBB16_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB16_27 +; GFX1164-NEXT: .LBB16_26: +; GFX1164-NEXT: ; implicit-def: 
$vgpr1 +; GFX1164-NEXT: .LBB16_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB16_30 +; GFX1164-NEXT: .LBB16_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB16_33 +; GFX1164-NEXT: .LBB16_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB16_36 +; GFX1164-NEXT: .LBB16_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_xor_b32 s6, s6, 
s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB16_39 +; GFX1164-NEXT: .LBB16_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB16_42 +; GFX1164-NEXT: .LBB16_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB16_45 +; GFX1164-NEXT: .LBB16_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB16_48 +; GFX1164-NEXT: .LBB16_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB16_51 +; GFX1164-NEXT: .LBB16_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB16_54 +; GFX1164-NEXT: .LBB16_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: 
s_branch .LBB16_57 +; GFX1164-NEXT: .LBB16_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB16_60 +; GFX1164-NEXT: .LBB16_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB16_63 +; GFX1164-NEXT: .LBB16_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB16_66 +; GFX1164-NEXT: .LBB16_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB16_69 +; GFX1164-NEXT: .LBB16_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB16_72 +; GFX1164-NEXT: .LBB16_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB16_75 +; GFX1164-NEXT: .LBB16_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 
+; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB16_78 +; GFX1164-NEXT: .LBB16_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB16_81 +; GFX1164-NEXT: .LBB16_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB16_84 +; GFX1164-NEXT: .LBB16_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_xor_b32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB16_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB16_87 +; GFX1164-NEXT: .LBB16_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB16_90 +; GFX1164-NEXT: .LBB16_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_xor_b32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB16_93 +; GFX1164-NEXT: .LBB16_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_xor_b32 s4, s4, s2 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB16_96 +; GFX1164-NEXT: .LBB16_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_96: +; 
GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s5, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_xor_b32 s6, s4, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB16_99 +; GFX1164-NEXT: .LBB16_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB16_102 +; GFX1164-NEXT: .LBB16_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB16_105 +; GFX1164-NEXT: .LBB16_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; 
GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB16_108 +; GFX1164-NEXT: .LBB16_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB16_111 +; GFX1164-NEXT: .LBB16_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB16_114 +; GFX1164-NEXT: .LBB16_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB16_117 +; GFX1164-NEXT: .LBB16_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB16_120 +; GFX1164-NEXT: .LBB16_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB16_123 +; GFX1164-NEXT: .LBB16_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB16_126 +; GFX1164-NEXT: 
.LBB16_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB16_129 +; GFX1164-NEXT: .LBB16_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB16_132 +; GFX1164-NEXT: .LBB16_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB16_135 +; GFX1164-NEXT: .LBB16_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB16_138 +; GFX1164-NEXT: .LBB16_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB16_141 +; GFX1164-NEXT: .LBB16_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB16_144 +; GFX1164-NEXT: .LBB16_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 
s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB16_147 +; GFX1164-NEXT: .LBB16_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB16_150 +; GFX1164-NEXT: .LBB16_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB16_153 +; GFX1164-NEXT: .LBB16_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_155 
+; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB16_156 +; GFX1164-NEXT: .LBB16_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB16_159 +; GFX1164-NEXT: .LBB16_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB16_162 +; GFX1164-NEXT: .LBB16_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB16_165 +; GFX1164-NEXT: .LBB16_164: +; 
GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB16_168 +; GFX1164-NEXT: .LBB16_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB16_171 +; GFX1164-NEXT: .LBB16_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB16_174 +; GFX1164-NEXT: .LBB16_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: 
v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB16_177 +; GFX1164-NEXT: .LBB16_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB16_180 +; GFX1164-NEXT: .LBB16_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB16_183 +; GFX1164-NEXT: .LBB16_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_xor_b32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: 
s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB16_186 +; GFX1164-NEXT: .LBB16_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB16_189 +; GFX1164-NEXT: .LBB16_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_xor_b32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB16_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB16_192 +; GFX1164-NEXT: .LBB16_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB16_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 +; GFX1164-NEXT: ; implicit-def: $vgpr0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB16_194 +; GFX1164-NEXT: ; %bb.193: +; 
GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_xor_b32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1164-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB16_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB16_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4310,53 +29902,510 @@ ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; 
GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB16_3 +; GFX1132-NEXT: .LBB16_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB16_6 +; GFX1132-NEXT: .LBB16_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo 
+; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB16_9 +; GFX1132-NEXT: .LBB16_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB16_12 +; GFX1132-NEXT: .LBB16_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB16_15 +; GFX1132-NEXT: .LBB16_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX1132-NEXT: s_cbranch_scc1 .LBB16_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB16_18 +; GFX1132-NEXT: .LBB16_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB16_21 +; GFX1132-NEXT: .LBB16_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB16_24 +; GFX1132-NEXT: .LBB16_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB16_27 +; GFX1132-NEXT: .LBB16_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo 
+; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB16_30 +; GFX1132-NEXT: .LBB16_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB16_33 +; GFX1132-NEXT: .LBB16_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB16_36 +; GFX1132-NEXT: .LBB16_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB16_39 +; GFX1132-NEXT: .LBB16_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB16_42 +; GFX1132-NEXT: .LBB16_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB16_45 +; GFX1132-NEXT: .LBB16_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB16_48 +; GFX1132-NEXT: .LBB16_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_48: +; 
GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB16_51 +; GFX1132-NEXT: .LBB16_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB16_54 +; GFX1132-NEXT: .LBB16_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB16_57 +; GFX1132-NEXT: .LBB16_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; 
GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB16_60 +; GFX1132-NEXT: .LBB16_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB16_63 +; GFX1132-NEXT: .LBB16_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB16_66 +; GFX1132-NEXT: .LBB16_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB16_69 +; GFX1132-NEXT: .LBB16_68: +; 
GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB16_72 +; GFX1132-NEXT: .LBB16_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB16_75 +; GFX1132-NEXT: .LBB16_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB16_78 +; GFX1132-NEXT: .LBB16_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: 
v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB16_81 +; GFX1132-NEXT: .LBB16_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB16_84 +; GFX1132-NEXT: .LBB16_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB16_87 +; GFX1132-NEXT: .LBB16_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 
29 +; GFX1132-NEXT: s_branch .LBB16_90 +; GFX1132-NEXT: .LBB16_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB16_93 +; GFX1132-NEXT: .LBB16_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB16_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB16_96 +; GFX1132-NEXT: .LBB16_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB16_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB16_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB16_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_xor_b32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_xor_rtn_b32 v0, v4, v0 +; GFX1132-NEXT: ds_xor_rtn_b32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB16_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB16_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4386,273 +30435,4568 @@ ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 
1 -; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_brev_b32 s6, 1 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, s6, 0 +; GFX8-NEXT: s_branch .LBB17_3 +; GFX8-NEXT: .LBB17_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s2, 0x80000000 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB17_6 +; GFX8-NEXT: .LBB17_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB17_9 +; GFX8-NEXT: .LBB17_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: 
s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB17_12 +; GFX8-NEXT: .LBB17_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB17_15 +; GFX8-NEXT: .LBB17_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB17_18 +; GFX8-NEXT: .LBB17_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB17_21 +; 
GFX8-NEXT: .LBB17_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB17_24 +; GFX8-NEXT: .LBB17_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB17_27 +; GFX8-NEXT: .LBB17_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB17_30 +; GFX8-NEXT: .LBB17_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB17_33 +; GFX8-NEXT: .LBB17_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB17_36 +; GFX8-NEXT: .LBB17_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB17_39 +; GFX8-NEXT: .LBB17_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB17_42 +; GFX8-NEXT: .LBB17_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB17_45 +; GFX8-NEXT: .LBB17_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB17_48 +; GFX8-NEXT: .LBB17_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB17_51 +; GFX8-NEXT: .LBB17_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_53 +; GFX8-NEXT: 
; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB17_54 +; GFX8-NEXT: .LBB17_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB17_57 +; GFX8-NEXT: .LBB17_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB17_60 +; GFX8-NEXT: .LBB17_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB17_63 +; GFX8-NEXT: .LBB17_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, 
exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB17_66 +; GFX8-NEXT: .LBB17_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB17_69 +; GFX8-NEXT: .LBB17_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB17_72 +; GFX8-NEXT: .LBB17_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB17_75 +; GFX8-NEXT: 
.LBB17_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB17_78 +; GFX8-NEXT: .LBB17_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB17_81 +; GFX8-NEXT: .LBB17_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB17_84 +; GFX8-NEXT: .LBB17_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 
0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB17_87 +; GFX8-NEXT: .LBB17_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB17_90 +; GFX8-NEXT: .LBB17_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB17_93 +; GFX8-NEXT: .LBB17_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX8-NEXT: s_max_i32 s4, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s5, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s4, 31 +; GFX8-NEXT: s_branch .LBB17_96 +; GFX8-NEXT: .LBB17_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_96: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec 
+; GFX8-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s4, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB17_99 +; GFX8-NEXT: .LBB17_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB17_102 +; GFX8-NEXT: .LBB17_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB17_105 +; GFX8-NEXT: .LBB17_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 35 +; 
GFX8-NEXT: s_cbranch_scc1 .LBB17_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB17_108 +; GFX8-NEXT: .LBB17_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB17_111 +; GFX8-NEXT: .LBB17_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB17_114 +; GFX8-NEXT: .LBB17_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB17_117 +; GFX8-NEXT: .LBB17_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB17_120 +; GFX8-NEXT: .LBB17_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB17_123 +; GFX8-NEXT: .LBB17_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB17_126 +; GFX8-NEXT: .LBB17_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 
s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB17_129 +; GFX8-NEXT: .LBB17_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB17_132 +; GFX8-NEXT: .LBB17_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB17_135 +; GFX8-NEXT: .LBB17_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB17_138 +; GFX8-NEXT: .LBB17_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB17_141 +; GFX8-NEXT: .LBB17_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB17_144 +; GFX8-NEXT: .LBB17_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB17_147 +; GFX8-NEXT: .LBB17_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s2, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB17_150 +; GFX8-NEXT: .LBB17_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB17_153 +; GFX8-NEXT: .LBB17_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB17_156 +; GFX8-NEXT: .LBB17_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB17_159 +; GFX8-NEXT: .LBB17_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_159: +; 
GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB17_162 +; GFX8-NEXT: .LBB17_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB17_165 +; GFX8-NEXT: .LBB17_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB17_168 +; GFX8-NEXT: .LBB17_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB17_171 +; GFX8-NEXT: .LBB17_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB17_174 +; GFX8-NEXT: .LBB17_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB17_177 +; GFX8-NEXT: .LBB17_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB17_180 +; GFX8-NEXT: .LBB17_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; 
GFX8-NEXT: .LBB17_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB17_183 +; GFX8-NEXT: .LBB17_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB17_186 +; GFX8-NEXT: .LBB17_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB17_189 +; GFX8-NEXT: .LBB17_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX8-NEXT: s_max_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; 
GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB17_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB17_192 +; GFX8-NEXT: .LBB17_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB17_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB17_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB17_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB17_2: +; GFX8-NEXT: .LBB17_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 -; GFX9-NEXT: s_not_b64 
exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_brev_b32 s6, 1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_writelane_b32 v1, s6, 0 +; GFX9-NEXT: s_branch .LBB17_3 +; GFX9-NEXT: .LBB17_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s2, 0x80000000 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB17_6 +; GFX9-NEXT: .LBB17_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; 
GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB17_9 +; GFX9-NEXT: .LBB17_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB17_12 +; GFX9-NEXT: .LBB17_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB17_15 +; GFX9-NEXT: .LBB17_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB17_18 
+; GFX9-NEXT: .LBB17_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB17_21 +; GFX9-NEXT: .LBB17_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB17_24 +; GFX9-NEXT: .LBB17_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB17_27 +; GFX9-NEXT: .LBB17_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB17_30 +; GFX9-NEXT: .LBB17_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB17_33 +; GFX9-NEXT: .LBB17_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB17_36 +; GFX9-NEXT: .LBB17_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB17_39 +; GFX9-NEXT: .LBB17_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB17_42 +; GFX9-NEXT: .LBB17_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB17_45 +; GFX9-NEXT: .LBB17_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB17_48 +; GFX9-NEXT: .LBB17_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_50 +; GFX9-NEXT: ; 
%bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB17_51 +; GFX9-NEXT: .LBB17_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB17_54 +; GFX9-NEXT: .LBB17_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB17_57 +; GFX9-NEXT: .LBB17_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB17_60 +; GFX9-NEXT: .LBB17_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 
0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB17_63 +; GFX9-NEXT: .LBB17_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB17_66 +; GFX9-NEXT: .LBB17_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB17_69 +; GFX9-NEXT: .LBB17_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB17_72 +; GFX9-NEXT: .LBB17_71: 
+; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB17_75 +; GFX9-NEXT: .LBB17_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB17_78 +; GFX9-NEXT: .LBB17_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB17_81 +; GFX9-NEXT: .LBB17_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB17_84 +; GFX9-NEXT: .LBB17_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB17_87 +; GFX9-NEXT: .LBB17_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB17_90 +; GFX9-NEXT: .LBB17_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB17_93 +; GFX9-NEXT: .LBB17_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_93: +; 
GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX9-NEXT: s_max_i32 s4, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s5, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s4, 31 +; GFX9-NEXT: s_branch .LBB17_96 +; GFX9-NEXT: .LBB17_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_96: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s4, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB17_99 +; GFX9-NEXT: .LBB17_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB17_102 +; GFX9-NEXT: .LBB17_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB17_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB17_105 +; GFX9-NEXT: .LBB17_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB17_108 +; GFX9-NEXT: .LBB17_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB17_111 +; GFX9-NEXT: .LBB17_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB17_114 +; GFX9-NEXT: .LBB17_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 
0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB17_117 +; GFX9-NEXT: .LBB17_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB17_120 +; GFX9-NEXT: .LBB17_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB17_123 +; GFX9-NEXT: .LBB17_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 41 +; GFX9-NEXT: 
s_cbranch_scc1 .LBB17_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB17_126 +; GFX9-NEXT: .LBB17_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB17_129 +; GFX9-NEXT: .LBB17_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB17_132 +; GFX9-NEXT: .LBB17_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB17_135 +; GFX9-NEXT: .LBB17_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB17_138 +; GFX9-NEXT: .LBB17_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB17_141 +; GFX9-NEXT: .LBB17_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB17_144 +; GFX9-NEXT: .LBB17_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB17_147 +; GFX9-NEXT: .LBB17_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB17_150 +; GFX9-NEXT: .LBB17_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB17_153 +; GFX9-NEXT: .LBB17_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB17_156 +; GFX9-NEXT: .LBB17_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_156: +; GFX9-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB17_159 +; GFX9-NEXT: .LBB17_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB17_162 +; GFX9-NEXT: .LBB17_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB17_165 +; GFX9-NEXT: .LBB17_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB17_168 +; GFX9-NEXT: .LBB17_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB17_171 +; GFX9-NEXT: .LBB17_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB17_174 +; GFX9-NEXT: .LBB17_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB17_177 +; GFX9-NEXT: .LBB17_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 
+; GFX9-NEXT: .LBB17_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB17_180 +; GFX9-NEXT: .LBB17_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB17_183 +; GFX9-NEXT: .LBB17_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB17_186 +; GFX9-NEXT: .LBB17_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: 
v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB17_189 +; GFX9-NEXT: .LBB17_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX9-NEXT: s_max_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB17_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB17_192 +; GFX9-NEXT: .LBB17_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB17_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB17_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB17_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX9-NEXT: s_max_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB17_2: +; GFX9-NEXT: .LBB17_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: 
v_max_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: 
s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: s_brev_b32 s6, 1 +; GFX1064-NEXT: v_writelane_b32 v1, s6, 0 +; GFX1064-NEXT: s_branch .LBB17_3 +; GFX1064-NEXT: .LBB17_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s2, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: s_max_i32 s6, s4, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB17_6 +; GFX1064-NEXT: .LBB17_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB17_9 +; GFX1064-NEXT: .LBB17_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_11 
+; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB17_12 +; GFX1064-NEXT: .LBB17_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB17_15 +; GFX1064-NEXT: .LBB17_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB17_18 +; GFX1064-NEXT: .LBB17_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB17_21 +; GFX1064-NEXT: .LBB17_20: +; GFX1064-NEXT: ; 
implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB17_24 +; GFX1064-NEXT: .LBB17_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB17_27 +; GFX1064-NEXT: .LBB17_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB17_30 +; GFX1064-NEXT: .LBB17_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, 
exec_lo, 0x400 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB17_33 +; GFX1064-NEXT: .LBB17_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB17_36 +; GFX1064-NEXT: .LBB17_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB17_39 +; GFX1064-NEXT: .LBB17_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 
13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB17_42 +; GFX1064-NEXT: .LBB17_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB17_45 +; GFX1064-NEXT: .LBB17_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB17_48 +; GFX1064-NEXT: .LBB17_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_50 +; 
GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB17_51 +; GFX1064-NEXT: .LBB17_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB17_54 +; GFX1064-NEXT: .LBB17_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB17_57 +; GFX1064-NEXT: .LBB17_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB17_60 +; GFX1064-NEXT: .LBB17_59: +; GFX1064-NEXT: ; implicit-def: 
$vgpr1 +; GFX1064-NEXT: .LBB17_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB17_63 +; GFX1064-NEXT: .LBB17_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB17_66 +; GFX1064-NEXT: .LBB17_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB17_69 +; GFX1064-NEXT: .LBB17_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 
0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB17_72 +; GFX1064-NEXT: .LBB17_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB17_75 +; GFX1064-NEXT: .LBB17_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB17_78 +; GFX1064-NEXT: .LBB17_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: 
s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB17_81 +; GFX1064-NEXT: .LBB17_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB17_84 +; GFX1064-NEXT: .LBB17_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB17_87 +; GFX1064-NEXT: .LBB17_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX1064-NEXT: s_cbranch_scc1 .LBB17_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB17_90 +; GFX1064-NEXT: .LBB17_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1064-NEXT: s_max_i32 s4, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_mov_b32 s9, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB17_93 +; GFX1064-NEXT: .LBB17_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_max_i32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB17_96 +; GFX1064-NEXT: .LBB17_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_max_i32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB17_99 +; GFX1064-NEXT: .LBB17_98: +; GFX1064-NEXT: 
; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB17_102 +; GFX1064-NEXT: .LBB17_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB17_105 +; GFX1064-NEXT: .LBB17_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB17_108 +; GFX1064-NEXT: .LBB17_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: 
s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB17_111 +; GFX1064-NEXT: .LBB17_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB17_114 +; GFX1064-NEXT: .LBB17_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB17_117 +; GFX1064-NEXT: .LBB17_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: 
s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB17_120 +; GFX1064-NEXT: .LBB17_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB17_123 +; GFX1064-NEXT: .LBB17_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB17_126 +; GFX1064-NEXT: .LBB17_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB17_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB17_129 +; GFX1064-NEXT: .LBB17_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB17_132 +; GFX1064-NEXT: .LBB17_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB17_135 +; GFX1064-NEXT: .LBB17_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB17_138 +; GFX1064-NEXT: 
.LBB17_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB17_141 +; GFX1064-NEXT: .LBB17_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB17_144 +; GFX1064-NEXT: .LBB17_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB17_147 +; GFX1064-NEXT: .LBB17_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_147: +; GFX1064-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB17_150 +; GFX1064-NEXT: .LBB17_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB17_153 +; GFX1064-NEXT: .LBB17_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB17_156 +; GFX1064-NEXT: .LBB17_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 
+; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB17_159 +; GFX1064-NEXT: .LBB17_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB17_162 +; GFX1064-NEXT: .LBB17_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB17_165 +; GFX1064-NEXT: .LBB17_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB17_168 +; GFX1064-NEXT: .LBB17_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB17_171 +; GFX1064-NEXT: .LBB17_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB17_174 +; GFX1064-NEXT: .LBB17_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_176 
+; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB17_177 +; GFX1064-NEXT: .LBB17_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB17_180 +; GFX1064-NEXT: .LBB17_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB17_183 +; GFX1064-NEXT: .LBB17_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_max_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB17_186 +; GFX1064-NEXT: .LBB17_185: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_max_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB17_189 +; GFX1064-NEXT: .LBB17_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_max_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB17_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB17_192 +; GFX1064-NEXT: .LBB17_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB17_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB17_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB17_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x80000000 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: 
s_max_i32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB17_2: +; GFX1064-NEXT: .LBB17_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_brev_b32 s4, 1 +; GFX1032-NEXT: v_writelane_b32 v1, s4, 0 +; GFX1032-NEXT: s_branch .LBB17_3 +; GFX1032-NEXT: .LBB17_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: s_max_i32 s2, s2, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB17_6 +; GFX1032-NEXT: .LBB17_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB17_9 +; GFX1032-NEXT: .LBB17_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_max_i32 s2, 
s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB17_12 +; GFX1032-NEXT: .LBB17_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB17_15 +; GFX1032-NEXT: .LBB17_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB17_18 +; GFX1032-NEXT: .LBB17_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: 
v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB17_21 +; GFX1032-NEXT: .LBB17_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB17_24 +; GFX1032-NEXT: .LBB17_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB17_27 +; GFX1032-NEXT: .LBB17_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB17_30 +; GFX1032-NEXT: .LBB17_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; 
GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB17_33 +; GFX1032-NEXT: .LBB17_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB17_36 +; GFX1032-NEXT: .LBB17_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB17_39 +; GFX1032-NEXT: .LBB17_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 
0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB17_42 +; GFX1032-NEXT: .LBB17_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB17_45 +; GFX1032-NEXT: .LBB17_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB17_48 +; GFX1032-NEXT: .LBB17_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB17_51 +; GFX1032-NEXT: .LBB17_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: 
.LBB17_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB17_54 +; GFX1032-NEXT: .LBB17_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB17_57 +; GFX1032-NEXT: .LBB17_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB17_60 +; GFX1032-NEXT: .LBB17_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; 
GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB17_63 +; GFX1032-NEXT: .LBB17_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB17_66 +; GFX1032-NEXT: .LBB17_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB17_69 +; GFX1032-NEXT: .LBB17_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; 
GFX1032-NEXT: s_branch .LBB17_72 +; GFX1032-NEXT: .LBB17_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB17_75 +; GFX1032-NEXT: .LBB17_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB17_78 +; GFX1032-NEXT: .LBB17_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB17_81 +; GFX1032-NEXT: .LBB17_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; 
GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB17_84 +; GFX1032-NEXT: .LBB17_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB17_87 +; GFX1032-NEXT: .LBB17_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB17_90 +; GFX1032-NEXT: .LBB17_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB17_93 +; GFX1032-NEXT: .LBB17_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB17_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB17_96 +; GFX1032-NEXT: .LBB17_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB17_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB17_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB17_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x80000000 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: s_max_i32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB17_2: +; GFX1032-NEXT: .LBB17_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; 
GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_brev_b32 s6, 1 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_writelane_b32 v1, s6, 0 +; GFX1164-NEXT: s_branch .LBB17_3 +; GFX1164-NEXT: .LBB17_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s2, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: s_max_i32 s6, s4, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB17_6 +; GFX1164-NEXT: .LBB17_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB17_9 +; GFX1164-NEXT: .LBB17_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB17_12 +; GFX1164-NEXT: .LBB17_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB17_15 +; GFX1164-NEXT: .LBB17_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB17_18 +; GFX1164-NEXT: .LBB17_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB17_21 +; GFX1164-NEXT: .LBB17_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB17_24 +; GFX1164-NEXT: .LBB17_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB17_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB17_27 +; GFX1164-NEXT: .LBB17_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB17_30 +; GFX1164-NEXT: .LBB17_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB17_33 +; GFX1164-NEXT: .LBB17_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB17_36 +; GFX1164-NEXT: .LBB17_35: +; GFX1164-NEXT: 
; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB17_39 +; GFX1164-NEXT: .LBB17_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB17_42 +; GFX1164-NEXT: .LBB17_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB17_45 +; GFX1164-NEXT: .LBB17_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, 
s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB17_48 +; GFX1164-NEXT: .LBB17_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB17_51 +; GFX1164-NEXT: .LBB17_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB17_54 +; GFX1164-NEXT: .LBB17_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: 
s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB17_57 +; GFX1164-NEXT: .LBB17_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB17_60 +; GFX1164-NEXT: .LBB17_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB17_63 +; GFX1164-NEXT: .LBB17_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB17_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB17_66 +; GFX1164-NEXT: .LBB17_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB17_69 +; GFX1164-NEXT: .LBB17_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB17_72 +; GFX1164-NEXT: .LBB17_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch 
.LBB17_75 +; GFX1164-NEXT: .LBB17_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB17_78 +; GFX1164-NEXT: .LBB17_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB17_81 +; GFX1164-NEXT: .LBB17_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB17_84 +; GFX1164-NEXT: .LBB17_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB17_87 +; GFX1164-NEXT: .LBB17_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB17_90 +; GFX1164-NEXT: .LBB17_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1164-NEXT: s_max_i32 s4, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_mov_b32 s9, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB17_93 +; GFX1164-NEXT: .LBB17_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_max_i32 s4, s4, s2 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB17_96 +; GFX1164-NEXT: .LBB17_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_max_i32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB17_99 +; GFX1164-NEXT: .LBB17_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB17_102 +; GFX1164-NEXT: .LBB17_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB17_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB17_105 +; GFX1164-NEXT: .LBB17_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB17_108 +; GFX1164-NEXT: .LBB17_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB17_111 +; GFX1164-NEXT: .LBB17_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB17_114 +; GFX1164-NEXT: .LBB17_113: +; 
GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB17_117 +; GFX1164-NEXT: .LBB17_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB17_120 +; GFX1164-NEXT: .LBB17_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB17_123 +; GFX1164-NEXT: .LBB17_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB17_126 +; GFX1164-NEXT: .LBB17_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB17_129 +; GFX1164-NEXT: .LBB17_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB17_132 +; GFX1164-NEXT: .LBB17_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 
44 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB17_135 +; GFX1164-NEXT: .LBB17_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB17_138 +; GFX1164-NEXT: .LBB17_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB17_141 +; GFX1164-NEXT: .LBB17_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB17_144 +; GFX1164-NEXT: .LBB17_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB17_147 +; GFX1164-NEXT: .LBB17_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB17_150 +; GFX1164-NEXT: .LBB17_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, 
s6, 50 +; GFX1164-NEXT: s_branch .LBB17_153 +; GFX1164-NEXT: .LBB17_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB17_156 +; GFX1164-NEXT: .LBB17_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB17_159 +; GFX1164-NEXT: .LBB17_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB17_162 +; GFX1164-NEXT: .LBB17_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_162: +; 
GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB17_165 +; GFX1164-NEXT: .LBB17_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB17_168 +; GFX1164-NEXT: .LBB17_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB17_171 +; GFX1164-NEXT: .LBB17_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: 
s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB17_174 +; GFX1164-NEXT: .LBB17_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB17_177 +; GFX1164-NEXT: .LBB17_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB17_180 +; GFX1164-NEXT: .LBB17_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_mov_b32 s2, 
0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB17_183 +; GFX1164-NEXT: .LBB17_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_max_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB17_186 +; GFX1164-NEXT: .LBB17_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x80000000 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_max_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB17_189 +; GFX1164-NEXT: .LBB17_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x80000000 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_max_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 
+; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB17_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB17_192 +; GFX1164-NEXT: .LBB17_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB17_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB17_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB17_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x80000000 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_max_i32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB17_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB17_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; 
GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4660,53 +35004,513 @@ ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, 1 -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_brev_b32 s4, 1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_writelane_b32 v1, s4, 0 +; GFX1132-NEXT: s_branch .LBB17_3 +; GFX1132-NEXT: .LBB17_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: s_max_i32 s2, s2, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB17_6 +; GFX1132-NEXT: .LBB17_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB17_9 +; GFX1132-NEXT: .LBB17_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB17_12 +; GFX1132-NEXT: .LBB17_11: +; 
GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB17_15 +; GFX1132-NEXT: .LBB17_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB17_18 +; GFX1132-NEXT: .LBB17_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB17_21 +; GFX1132-NEXT: .LBB17_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: 
v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB17_24 +; GFX1132-NEXT: .LBB17_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB17_27 +; GFX1132-NEXT: .LBB17_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB17_30 +; GFX1132-NEXT: .LBB17_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, 
s2, 10 +; GFX1132-NEXT: s_branch .LBB17_33 +; GFX1132-NEXT: .LBB17_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB17_36 +; GFX1132-NEXT: .LBB17_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB17_39 +; GFX1132-NEXT: .LBB17_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB17_42 +; GFX1132-NEXT: .LBB17_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; 
GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB17_45 +; GFX1132-NEXT: .LBB17_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB17_48 +; GFX1132-NEXT: .LBB17_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB17_51 +; GFX1132-NEXT: .LBB17_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 
s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB17_54 +; GFX1132-NEXT: .LBB17_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB17_57 +; GFX1132-NEXT: .LBB17_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB17_60 +; GFX1132-NEXT: .LBB17_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB17_63 +; GFX1132-NEXT: .LBB17_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB17_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB17_66 +; GFX1132-NEXT: .LBB17_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB17_69 +; GFX1132-NEXT: .LBB17_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB17_72 +; GFX1132-NEXT: .LBB17_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: 
v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB17_75 +; GFX1132-NEXT: .LBB17_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB17_78 +; GFX1132-NEXT: .LBB17_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB17_81 +; GFX1132-NEXT: .LBB17_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: 
v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB17_84 +; GFX1132-NEXT: .LBB17_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB17_87 +; GFX1132-NEXT: .LBB17_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB17_90 +; GFX1132-NEXT: .LBB17_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB17_93 +; GFX1132-NEXT: .LBB17_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, 
s3, 0x80000000 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB17_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB17_96 +; GFX1132-NEXT: .LBB17_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB17_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB17_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB17_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x80000000 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: s_max_i32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_max_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB17_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB17_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4987,273 +35791,4568 @@ ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_brev_b32 s6, -2 +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, s6, 0 +; GFX8-NEXT: s_branch .LBB19_3 +; GFX8-NEXT: .LBB19_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; 
GFX8-NEXT: .LBB19_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s2, 0x7fffffff +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB19_6 +; GFX8-NEXT: .LBB19_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB19_9 +; GFX8-NEXT: .LBB19_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB19_12 +; GFX8-NEXT: .LBB19_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB19_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB19_15 +; GFX8-NEXT: .LBB19_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB19_18 +; GFX8-NEXT: .LBB19_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB19_21 +; GFX8-NEXT: .LBB19_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB19_24 +; GFX8-NEXT: .LBB19_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: 
s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB19_27 +; GFX8-NEXT: .LBB19_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB19_30 +; GFX8-NEXT: .LBB19_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB19_33 +; GFX8-NEXT: .LBB19_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB19_36 +; GFX8-NEXT: 
.LBB19_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB19_39 +; GFX8-NEXT: .LBB19_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB19_42 +; GFX8-NEXT: .LBB19_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB19_45 +; GFX8-NEXT: .LBB19_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB19_48 +; GFX8-NEXT: .LBB19_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB19_51 +; GFX8-NEXT: .LBB19_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB19_54 +; GFX8-NEXT: .LBB19_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB19_57 +; GFX8-NEXT: .LBB19_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_57: +; GFX8-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB19_60 +; GFX8-NEXT: .LBB19_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB19_63 +; GFX8-NEXT: .LBB19_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB19_66 +; GFX8-NEXT: .LBB19_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 
.LBB19_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB19_69 +; GFX8-NEXT: .LBB19_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB19_72 +; GFX8-NEXT: .LBB19_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB19_75 +; GFX8-NEXT: .LBB19_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB19_78 +; GFX8-NEXT: .LBB19_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; 
GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB19_81 +; GFX8-NEXT: .LBB19_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB19_84 +; GFX8-NEXT: .LBB19_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB19_87 +; GFX8-NEXT: .LBB19_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: 
s_branch .LBB19_90 +; GFX8-NEXT: .LBB19_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB19_93 +; GFX8-NEXT: .LBB19_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX8-NEXT: s_min_i32 s4, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s5, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s4, 31 +; GFX8-NEXT: s_branch .LBB19_96 +; GFX8-NEXT: .LBB19_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_96: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s4, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB19_99 +; GFX8-NEXT: .LBB19_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB19_102 +; GFX8-NEXT: .LBB19_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB19_105 +; GFX8-NEXT: .LBB19_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB19_108 +; GFX8-NEXT: .LBB19_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB19_111 +; GFX8-NEXT: .LBB19_110: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB19_114 +; GFX8-NEXT: .LBB19_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB19_117 +; GFX8-NEXT: .LBB19_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB19_120 +; GFX8-NEXT: .LBB19_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB19_123 +; GFX8-NEXT: .LBB19_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB19_126 +; GFX8-NEXT: .LBB19_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB19_129 +; GFX8-NEXT: .LBB19_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB19_132 +; GFX8-NEXT: .LBB19_131: +; GFX8-NEXT: 
; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB19_135 +; GFX8-NEXT: .LBB19_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB19_138 +; GFX8-NEXT: .LBB19_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB19_141 +; GFX8-NEXT: .LBB19_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB19_144 +; GFX8-NEXT: .LBB19_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB19_147 +; GFX8-NEXT: .LBB19_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB19_150 +; GFX8-NEXT: .LBB19_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB19_153 +; 
GFX8-NEXT: .LBB19_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB19_156 +; GFX8-NEXT: .LBB19_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB19_159 +; GFX8-NEXT: .LBB19_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB19_162 +; GFX8-NEXT: .LBB19_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_mov_b32 s2, 
0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB19_165 +; GFX8-NEXT: .LBB19_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB19_168 +; GFX8-NEXT: .LBB19_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB19_171 +; GFX8-NEXT: .LBB19_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: 
s_branch .LBB19_174 +; GFX8-NEXT: .LBB19_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB19_177 +; GFX8-NEXT: .LBB19_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB19_180 +; GFX8-NEXT: .LBB19_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB19_183 +; GFX8-NEXT: .LBB19_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: 
s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB19_186 +; GFX8-NEXT: .LBB19_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB19_189 +; GFX8-NEXT: .LBB19_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB19_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB19_192 +; GFX8-NEXT: .LBB19_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB19_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB19_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB19_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX8-NEXT: s_min_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB19_2: +; GFX8-NEXT: .LBB19_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX8-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; 
GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_brev_b32 s6, -2 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_writelane_b32 v1, s6, 0 +; GFX9-NEXT: s_branch .LBB19_3 +; GFX9-NEXT: .LBB19_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s2, 0x7fffffff +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB19_6 +; GFX9-NEXT: .LBB19_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB19_9 +; GFX9-NEXT: .LBB19_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB19_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB19_12 +; GFX9-NEXT: .LBB19_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB19_15 +; GFX9-NEXT: .LBB19_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB19_18 +; GFX9-NEXT: .LBB19_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB19_21 +; GFX9-NEXT: .LBB19_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; 
GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB19_24 +; GFX9-NEXT: .LBB19_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB19_27 +; GFX9-NEXT: .LBB19_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB19_30 +; GFX9-NEXT: .LBB19_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB19_33 +; GFX9-NEXT: 
.LBB19_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB19_36 +; GFX9-NEXT: .LBB19_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB19_39 +; GFX9-NEXT: .LBB19_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB19_42 +; GFX9-NEXT: .LBB19_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB19_45 +; GFX9-NEXT: .LBB19_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB19_48 +; GFX9-NEXT: .LBB19_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB19_51 +; GFX9-NEXT: .LBB19_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB19_54 +; GFX9-NEXT: .LBB19_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_54: +; GFX9-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB19_57 +; GFX9-NEXT: .LBB19_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB19_60 +; GFX9-NEXT: .LBB19_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB19_63 +; GFX9-NEXT: .LBB19_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 
.LBB19_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB19_66 +; GFX9-NEXT: .LBB19_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB19_69 +; GFX9-NEXT: .LBB19_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB19_72 +; GFX9-NEXT: .LBB19_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB19_75 +; GFX9-NEXT: .LBB19_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; 
GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB19_78 +; GFX9-NEXT: .LBB19_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB19_81 +; GFX9-NEXT: .LBB19_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB19_84 +; GFX9-NEXT: .LBB19_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: 
s_branch .LBB19_87 +; GFX9-NEXT: .LBB19_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB19_90 +; GFX9-NEXT: .LBB19_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB19_93 +; GFX9-NEXT: .LBB19_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX9-NEXT: s_min_i32 s4, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s5, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s4, 31 +; GFX9-NEXT: s_branch .LBB19_96 +; GFX9-NEXT: .LBB19_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_96: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s4, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB19_99 +; GFX9-NEXT: .LBB19_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB19_102 +; GFX9-NEXT: .LBB19_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB19_105 +; GFX9-NEXT: .LBB19_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB19_108 +; GFX9-NEXT: .LBB19_107: +; GFX9-NEXT: ; implicit-def: 
$vgpr1 +; GFX9-NEXT: .LBB19_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB19_111 +; GFX9-NEXT: .LBB19_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB19_114 +; GFX9-NEXT: .LBB19_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB19_117 +; GFX9-NEXT: .LBB19_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB19_120 +; GFX9-NEXT: .LBB19_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB19_123 +; GFX9-NEXT: .LBB19_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB19_126 +; GFX9-NEXT: .LBB19_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB19_129 +; GFX9-NEXT: .LBB19_128: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB19_132 +; GFX9-NEXT: .LBB19_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB19_135 +; GFX9-NEXT: .LBB19_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB19_138 +; GFX9-NEXT: .LBB19_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; 
GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB19_141 +; GFX9-NEXT: .LBB19_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB19_144 +; GFX9-NEXT: .LBB19_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB19_147 +; GFX9-NEXT: .LBB19_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB19_150 +; GFX9-NEXT: .LBB19_149: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB19_153 +; GFX9-NEXT: .LBB19_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB19_156 +; GFX9-NEXT: .LBB19_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB19_159 +; GFX9-NEXT: .LBB19_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB19_162 +; GFX9-NEXT: .LBB19_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB19_165 +; GFX9-NEXT: .LBB19_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB19_168 +; GFX9-NEXT: .LBB19_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; 
GFX9-NEXT: s_branch .LBB19_171 +; GFX9-NEXT: .LBB19_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB19_174 +; GFX9-NEXT: .LBB19_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB19_177 +; GFX9-NEXT: .LBB19_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB19_180 +; GFX9-NEXT: .LBB19_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 
0x10000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB19_183 +; GFX9-NEXT: .LBB19_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB19_186 +; GFX9-NEXT: .LBB19_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB19_189 +; GFX9-NEXT: .LBB19_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX9-NEXT: s_min_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 
s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB19_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB19_192 +; GFX9-NEXT: .LBB19_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB19_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB19_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB19_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX9-NEXT: s_min_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB19_2: +; GFX9-NEXT: .LBB19_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_i32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 
row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: s_brev_b32 s6, -2 +; GFX1064-NEXT: v_writelane_b32 v1, s6, 0 +; GFX1064-NEXT: s_branch .LBB19_3 +; GFX1064-NEXT: .LBB19_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s2, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: s_min_i32 s6, s4, 0x7fffffff +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX1064-NEXT: s_cbranch_scc1 .LBB19_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB19_6 +; GFX1064-NEXT: .LBB19_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB19_9 +; GFX1064-NEXT: .LBB19_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB19_12 +; GFX1064-NEXT: .LBB19_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB19_15 +; GFX1064-NEXT: 
.LBB19_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB19_18 +; GFX1064-NEXT: .LBB19_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB19_21 +; GFX1064-NEXT: .LBB19_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB19_24 +; GFX1064-NEXT: .LBB19_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; 
GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB19_27 +; GFX1064-NEXT: .LBB19_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB19_30 +; GFX1064-NEXT: .LBB19_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB19_33 +; GFX1064-NEXT: .LBB19_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: 
s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB19_36 +; GFX1064-NEXT: .LBB19_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB19_39 +; GFX1064-NEXT: .LBB19_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB19_42 +; GFX1064-NEXT: .LBB19_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB19_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB19_45 +; GFX1064-NEXT: .LBB19_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB19_48 +; GFX1064-NEXT: .LBB19_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB19_51 +; GFX1064-NEXT: .LBB19_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB19_54 +; GFX1064-NEXT: .LBB19_53: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB19_57 +; GFX1064-NEXT: .LBB19_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB19_60 +; GFX1064-NEXT: .LBB19_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB19_63 +; GFX1064-NEXT: .LBB19_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: 
s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB19_66 +; GFX1064-NEXT: .LBB19_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB19_69 +; GFX1064-NEXT: .LBB19_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB19_72 +; GFX1064-NEXT: .LBB19_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 
+; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB19_75 +; GFX1064-NEXT: .LBB19_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB19_78 +; GFX1064-NEXT: .LBB19_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB19_81 +; GFX1064-NEXT: .LBB19_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 
+; GFX1064-NEXT: s_cbranch_scc1 .LBB19_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB19_84 +; GFX1064-NEXT: .LBB19_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB19_87 +; GFX1064-NEXT: .LBB19_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB19_90 +; GFX1064-NEXT: .LBB19_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1064-NEXT: s_min_i32 s4, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_mov_b32 s9, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: 
s_branch .LBB19_93 +; GFX1064-NEXT: .LBB19_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_min_i32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB19_96 +; GFX1064-NEXT: .LBB19_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_min_i32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB19_99 +; GFX1064-NEXT: .LBB19_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB19_102 +; GFX1064-NEXT: .LBB19_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; 
GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB19_105 +; GFX1064-NEXT: .LBB19_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB19_108 +; GFX1064-NEXT: .LBB19_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB19_111 +; GFX1064-NEXT: .LBB19_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: 
s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB19_114 +; GFX1064-NEXT: .LBB19_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB19_117 +; GFX1064-NEXT: .LBB19_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB19_120 +; GFX1064-NEXT: .LBB19_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB19_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB19_123 +; GFX1064-NEXT: .LBB19_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB19_126 +; GFX1064-NEXT: .LBB19_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB19_129 +; GFX1064-NEXT: .LBB19_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB19_132 +; GFX1064-NEXT: 
.LBB19_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB19_135 +; GFX1064-NEXT: .LBB19_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB19_138 +; GFX1064-NEXT: .LBB19_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB19_141 +; GFX1064-NEXT: .LBB19_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_141: +; GFX1064-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB19_144 +; GFX1064-NEXT: .LBB19_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB19_147 +; GFX1064-NEXT: .LBB19_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB19_150 +; GFX1064-NEXT: .LBB19_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; 
GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB19_153 +; GFX1064-NEXT: .LBB19_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB19_156 +; GFX1064-NEXT: .LBB19_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB19_159 +; GFX1064-NEXT: .LBB19_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB19_162 +; GFX1064-NEXT: .LBB19_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB19_165 +; GFX1064-NEXT: .LBB19_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB19_168 +; GFX1064-NEXT: .LBB19_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_170 +; GFX1064-NEXT: 
; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB19_171 +; GFX1064-NEXT: .LBB19_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB19_174 +; GFX1064-NEXT: .LBB19_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB19_177 +; GFX1064-NEXT: .LBB19_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB19_180 +; GFX1064-NEXT: .LBB19_179: +; GFX1064-NEXT: ; 
implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB19_183 +; GFX1064-NEXT: .LBB19_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_min_i32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB19_186 +; GFX1064-NEXT: .LBB19_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_min_i32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB19_189 +; GFX1064-NEXT: .LBB19_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_189: +; 
GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_min_i32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB19_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB19_192 +; GFX1064-NEXT: .LBB19_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB19_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB19_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB19_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0x7fffffff ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_min_i32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB19_2: +; GFX1064-NEXT: .LBB19_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; 
GFX1064-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: s_brev_b32 s4, -2 +; GFX1032-NEXT: v_writelane_b32 v1, s4, 0 +; GFX1032-NEXT: s_branch .LBB19_3 +; GFX1032-NEXT: .LBB19_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: 
s_cselect_b32 s2, s2, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: s_min_i32 s2, s2, 0x7fffffff +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB19_6 +; GFX1032-NEXT: .LBB19_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB19_9 +; GFX1032-NEXT: .LBB19_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB19_12 +; GFX1032-NEXT: .LBB19_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB19_15 +; GFX1032-NEXT: .LBB19_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB19_18 +; GFX1032-NEXT: .LBB19_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB19_21 +; GFX1032-NEXT: .LBB19_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB19_24 +; GFX1032-NEXT: .LBB19_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: 
.LBB19_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB19_27 +; GFX1032-NEXT: .LBB19_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB19_30 +; GFX1032-NEXT: .LBB19_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB19_33 +; GFX1032-NEXT: .LBB19_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: 
s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB19_36 +; GFX1032-NEXT: .LBB19_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB19_39 +; GFX1032-NEXT: .LBB19_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB19_42 +; GFX1032-NEXT: .LBB19_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch 
.LBB19_45 +; GFX1032-NEXT: .LBB19_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB19_48 +; GFX1032-NEXT: .LBB19_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB19_51 +; GFX1032-NEXT: .LBB19_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB19_54 +; GFX1032-NEXT: .LBB19_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 
0x40000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB19_57 +; GFX1032-NEXT: .LBB19_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB19_60 +; GFX1032-NEXT: .LBB19_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB19_63 +; GFX1032-NEXT: .LBB19_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: 
s_cbranch_scc1 .LBB19_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB19_66 +; GFX1032-NEXT: .LBB19_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB19_69 +; GFX1032-NEXT: .LBB19_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB19_72 +; GFX1032-NEXT: .LBB19_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB19_75 +; GFX1032-NEXT: .LBB19_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_75: +; 
GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB19_78 +; GFX1032-NEXT: .LBB19_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB19_81 +; GFX1032-NEXT: .LBB19_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB19_84 +; GFX1032-NEXT: .LBB19_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; 
GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB19_87 +; GFX1032-NEXT: .LBB19_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB19_90 +; GFX1032-NEXT: .LBB19_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB19_93 +; GFX1032-NEXT: .LBB19_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB19_95 +; GFX1032-NEXT: ; %bb.94: 
+; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB19_96 +; GFX1032-NEXT: .LBB19_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB19_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB19_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB19_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0x7fffffff ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: s_min_i32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB19_2: +; GFX1032-NEXT: .LBB19_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: 
s_cbranch_scc1 .LBB19_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-NEXT: s_brev_b32 s6, -2 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; 
GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_writelane_b32 v1, s6, 0 +; GFX1164-NEXT: s_branch .LBB19_3 +; GFX1164-NEXT: .LBB19_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s2, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: s_min_i32 s6, s4, 0x7fffffff +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB19_6 +; GFX1164-NEXT: .LBB19_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB19_9 +; GFX1164-NEXT: .LBB19_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; 
GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB19_12 +; GFX1164-NEXT: .LBB19_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB19_15 +; GFX1164-NEXT: .LBB19_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB19_18 +; GFX1164-NEXT: .LBB19_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB19_21 +; GFX1164-NEXT: .LBB19_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB19_24 +; GFX1164-NEXT: .LBB19_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB19_27 +; GFX1164-NEXT: .LBB19_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB19_30 +; GFX1164-NEXT: 
.LBB19_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB19_33 +; GFX1164-NEXT: .LBB19_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB19_36 +; GFX1164-NEXT: .LBB19_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB19_39 +; GFX1164-NEXT: .LBB19_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB19_42 +; GFX1164-NEXT: .LBB19_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB19_45 +; GFX1164-NEXT: .LBB19_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB19_48 +; GFX1164-NEXT: .LBB19_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, 
v0, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB19_51 +; GFX1164-NEXT: .LBB19_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB19_54 +; GFX1164-NEXT: .LBB19_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB19_57 +; GFX1164-NEXT: .LBB19_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB19_60 +; GFX1164-NEXT: .LBB19_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB19_63 +; GFX1164-NEXT: .LBB19_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB19_66 +; GFX1164-NEXT: .LBB19_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; 
GFX1164-NEXT: s_branch .LBB19_69 +; GFX1164-NEXT: .LBB19_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB19_72 +; GFX1164-NEXT: .LBB19_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB19_75 +; GFX1164-NEXT: .LBB19_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB19_78 +; GFX1164-NEXT: .LBB19_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_78: +; GFX1164-NEXT: s_and_b64 
s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB19_81 +; GFX1164-NEXT: .LBB19_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB19_84 +; GFX1164-NEXT: .LBB19_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB19_87 +; GFX1164-NEXT: .LBB19_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; 
GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB19_90 +; GFX1164-NEXT: .LBB19_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1164-NEXT: s_min_i32 s4, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_mov_b32 s9, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB19_93 +; GFX1164-NEXT: .LBB19_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_min_i32 s4, s4, s2 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB19_96 +; GFX1164-NEXT: .LBB19_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_min_i32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB19_99 +; GFX1164-NEXT: .LBB19_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB19_102 +; GFX1164-NEXT: .LBB19_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB19_105 +; GFX1164-NEXT: .LBB19_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB19_108 +; 
GFX1164-NEXT: .LBB19_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB19_111 +; GFX1164-NEXT: .LBB19_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB19_114 +; GFX1164-NEXT: .LBB19_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB19_117 +; GFX1164-NEXT: .LBB19_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_117: +; GFX1164-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB19_120 +; GFX1164-NEXT: .LBB19_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB19_123 +; GFX1164-NEXT: .LBB19_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB19_126 +; GFX1164-NEXT: .LBB19_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; 
GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB19_129 +; GFX1164-NEXT: .LBB19_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB19_132 +; GFX1164-NEXT: .LBB19_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB19_135 +; GFX1164-NEXT: .LBB19_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB19_138 +; GFX1164-NEXT: .LBB19_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB19_141 +; GFX1164-NEXT: .LBB19_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB19_144 +; GFX1164-NEXT: .LBB19_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_146 +; GFX1164-NEXT: ; %bb.145: +; 
GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB19_147 +; GFX1164-NEXT: .LBB19_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB19_150 +; GFX1164-NEXT: .LBB19_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB19_153 +; GFX1164-NEXT: .LBB19_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB19_156 +; GFX1164-NEXT: .LBB19_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB19_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB19_159 +; GFX1164-NEXT: .LBB19_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB19_162 +; GFX1164-NEXT: .LBB19_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB19_165 +; GFX1164-NEXT: .LBB19_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 
0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB19_168 +; GFX1164-NEXT: .LBB19_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB19_171 +; GFX1164-NEXT: .LBB19_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB19_174 +; GFX1164-NEXT: .LBB19_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; 
GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB19_177 +; GFX1164-NEXT: .LBB19_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB19_180 +; GFX1164-NEXT: .LBB19_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB19_183 +; GFX1164-NEXT: .LBB19_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_min_i32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB19_186 +; GFX1164-NEXT: .LBB19_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0x7fffffff +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_min_i32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB19_189 +; GFX1164-NEXT: .LBB19_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0x7fffffff +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_min_i32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB19_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB19_192 +; GFX1164-NEXT: .LBB19_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB19_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB19_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz 
.LBB19_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0x7fffffff ; GFX1164-NEXT: v_mov_b32_e32 v0, 0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_min_i32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB19_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB19_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5261,53 +40360,513 @@ ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_bfrev_b32_e32 v3, -2 -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: s_brev_b32 s4, -2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_writelane_b32 v1, s4, 0 +; GFX1132-NEXT: s_branch .LBB19_3 +; GFX1132-NEXT: .LBB19_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: s_min_i32 s2, s2, 0x7fffffff +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_5 +; 
GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB19_6 +; GFX1132-NEXT: .LBB19_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB19_9 +; GFX1132-NEXT: .LBB19_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB19_12 +; GFX1132-NEXT: .LBB19_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB19_15 +; GFX1132-NEXT: .LBB19_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 
s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB19_18 +; GFX1132-NEXT: .LBB19_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB19_21 +; GFX1132-NEXT: .LBB19_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB19_24 +; GFX1132-NEXT: .LBB19_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 
s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB19_27 +; GFX1132-NEXT: .LBB19_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB19_30 +; GFX1132-NEXT: .LBB19_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB19_33 +; GFX1132-NEXT: .LBB19_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB19_36 +; GFX1132-NEXT: .LBB19_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: 
.LBB19_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB19_39 +; GFX1132-NEXT: .LBB19_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB19_42 +; GFX1132-NEXT: .LBB19_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB19_45 +; GFX1132-NEXT: .LBB19_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; 
GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB19_48 +; GFX1132-NEXT: .LBB19_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB19_51 +; GFX1132-NEXT: .LBB19_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB19_54 +; GFX1132-NEXT: .LBB19_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; 
GFX1132-NEXT: s_branch .LBB19_57 +; GFX1132-NEXT: .LBB19_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB19_60 +; GFX1132-NEXT: .LBB19_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB19_63 +; GFX1132-NEXT: .LBB19_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB19_66 +; GFX1132-NEXT: .LBB19_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; 
GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB19_69 +; GFX1132-NEXT: .LBB19_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB19_72 +; GFX1132-NEXT: .LBB19_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB19_75 +; GFX1132-NEXT: .LBB19_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB19_78 +; GFX1132-NEXT: .LBB19_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB19_81 +; GFX1132-NEXT: .LBB19_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB19_84 +; GFX1132-NEXT: .LBB19_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB19_87 +; GFX1132-NEXT: .LBB19_86: +; GFX1132-NEXT: ; implicit-def: 
$vgpr1 +; GFX1132-NEXT: .LBB19_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB19_90 +; GFX1132-NEXT: .LBB19_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB19_93 +; GFX1132-NEXT: .LBB19_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB19_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB19_96 +; GFX1132-NEXT: .LBB19_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB19_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: 
s_cbranch_execz .LBB19_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB19_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0x7fffffff ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: s_min_i32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_i32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB19_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB19_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_i32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5588,269 +41147,4527 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 
.LBB21_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB21_3 +; GFX8-NEXT: .LBB21_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB21_6 +; GFX8-NEXT: .LBB21_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB21_9 +; GFX8-NEXT: .LBB21_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB21_12 +; GFX8-NEXT: .LBB21_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB21_15 +; GFX8-NEXT: .LBB21_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB21_18 +; GFX8-NEXT: .LBB21_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 
s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB21_21 +; GFX8-NEXT: .LBB21_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB21_24 +; GFX8-NEXT: .LBB21_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB21_27 +; GFX8-NEXT: .LBB21_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB21_30 +; GFX8-NEXT: .LBB21_29: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB21_33 +; GFX8-NEXT: .LBB21_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB21_36 +; GFX8-NEXT: .LBB21_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB21_39 +; GFX8-NEXT: .LBB21_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB21_42 +; GFX8-NEXT: .LBB21_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB21_45 +; GFX8-NEXT: .LBB21_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB21_48 +; GFX8-NEXT: .LBB21_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB21_51 +; GFX8-NEXT: .LBB21_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, 
s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB21_54 +; GFX8-NEXT: .LBB21_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB21_57 +; GFX8-NEXT: .LBB21_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB21_60 +; GFX8-NEXT: .LBB21_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB21_63 +; 
GFX8-NEXT: .LBB21_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB21_66 +; GFX8-NEXT: .LBB21_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB21_69 +; GFX8-NEXT: .LBB21_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB21_72 +; GFX8-NEXT: .LBB21_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB21_75 +; GFX8-NEXT: .LBB21_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB21_78 +; GFX8-NEXT: .LBB21_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB21_81 +; GFX8-NEXT: .LBB21_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB21_84 +; GFX8-NEXT: .LBB21_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 
0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB21_87 +; GFX8-NEXT: .LBB21_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB21_90 +; GFX8-NEXT: .LBB21_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_max_u32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB21_93 +; GFX8-NEXT: .LBB21_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_max_u32 s3, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s6, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s3, 31 +; GFX8-NEXT: s_branch 
.LBB21_96 +; GFX8-NEXT: .LBB21_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s6, 0 +; GFX8-NEXT: s_max_u32 s6, s3, s4 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB21_99 +; GFX8-NEXT: .LBB21_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB21_102 +; GFX8-NEXT: .LBB21_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB21_105 +; GFX8-NEXT: .LBB21_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB21_108 +; GFX8-NEXT: .LBB21_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB21_111 +; GFX8-NEXT: .LBB21_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB21_114 +; GFX8-NEXT: .LBB21_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB21_117 +; GFX8-NEXT: .LBB21_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB21_120 +; GFX8-NEXT: .LBB21_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB21_123 +; GFX8-NEXT: .LBB21_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB21_126 +; GFX8-NEXT: .LBB21_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 
v1, s6, 42 +; GFX8-NEXT: s_branch .LBB21_129 +; GFX8-NEXT: .LBB21_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB21_132 +; GFX8-NEXT: .LBB21_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB21_135 +; GFX8-NEXT: .LBB21_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB21_138 +; GFX8-NEXT: .LBB21_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB21_141 +; GFX8-NEXT: .LBB21_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB21_144 +; GFX8-NEXT: .LBB21_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB21_147 +; GFX8-NEXT: .LBB21_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB21_150 +; GFX8-NEXT: .LBB21_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_150: +; GFX8-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB21_153 +; GFX8-NEXT: .LBB21_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB21_156 +; GFX8-NEXT: .LBB21_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB21_159 +; GFX8-NEXT: .LBB21_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 
.LBB21_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB21_162 +; GFX8-NEXT: .LBB21_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB21_165 +; GFX8-NEXT: .LBB21_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB21_168 +; GFX8-NEXT: .LBB21_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB21_171 +; GFX8-NEXT: .LBB21_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 
s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB21_174 +; GFX8-NEXT: .LBB21_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB21_177 +; GFX8-NEXT: .LBB21_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB21_180 +; GFX8-NEXT: .LBB21_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB21_183 +; GFX8-NEXT: 
.LBB21_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB21_186 +; GFX8-NEXT: .LBB21_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB21_189 +; GFX8-NEXT: .LBB21_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_max_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB21_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB21_192 +; GFX8-NEXT: .LBB21_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB21_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX8-NEXT: s_cbranch_execz .LBB21_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_cbranch_execz .LBB21_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_max_u32 s4, s6, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB21_2: +; GFX8-NEXT: .LBB21_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB21_3 +; GFX9-NEXT: .LBB21_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB21_6 +; GFX9-NEXT: .LBB21_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB21_9 +; GFX9-NEXT: .LBB21_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB21_12 +; GFX9-NEXT: .LBB21_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB21_15 +; GFX9-NEXT: .LBB21_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB21_18 +; GFX9-NEXT: .LBB21_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB21_21 +; GFX9-NEXT: .LBB21_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: 
s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB21_24 +; GFX9-NEXT: .LBB21_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB21_27 +; GFX9-NEXT: .LBB21_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB21_30 +; GFX9-NEXT: .LBB21_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB21_33 +; GFX9-NEXT: .LBB21_32: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB21_36 +; GFX9-NEXT: .LBB21_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB21_39 +; GFX9-NEXT: .LBB21_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB21_42 +; GFX9-NEXT: .LBB21_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 
14 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB21_45 +; GFX9-NEXT: .LBB21_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB21_48 +; GFX9-NEXT: .LBB21_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB21_51 +; GFX9-NEXT: .LBB21_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB21_54 +; GFX9-NEXT: .LBB21_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: 
s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB21_57 +; GFX9-NEXT: .LBB21_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB21_60 +; GFX9-NEXT: .LBB21_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB21_63 +; GFX9-NEXT: .LBB21_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB21_66 +; GFX9-NEXT: .LBB21_65: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB21_69 +; GFX9-NEXT: .LBB21_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB21_72 +; GFX9-NEXT: .LBB21_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB21_75 +; GFX9-NEXT: .LBB21_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB21_78 +; GFX9-NEXT: .LBB21_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB21_81 +; GFX9-NEXT: .LBB21_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB21_84 +; GFX9-NEXT: .LBB21_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB21_87 +; GFX9-NEXT: .LBB21_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: 
s_max_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB21_90 +; GFX9-NEXT: .LBB21_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_max_u32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB21_93 +; GFX9-NEXT: .LBB21_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_max_u32 s3, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s6, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s3, 31 +; GFX9-NEXT: s_branch .LBB21_96 +; GFX9-NEXT: .LBB21_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: s_max_u32 s6, s3, s4 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch 
.LBB21_99 +; GFX9-NEXT: .LBB21_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB21_102 +; GFX9-NEXT: .LBB21_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB21_105 +; GFX9-NEXT: .LBB21_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB21_108 +; GFX9-NEXT: .LBB21_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB21_111 +; GFX9-NEXT: .LBB21_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB21_114 +; GFX9-NEXT: .LBB21_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB21_117 +; GFX9-NEXT: .LBB21_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB21_120 +; GFX9-NEXT: .LBB21_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 
+; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB21_123 +; GFX9-NEXT: .LBB21_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB21_126 +; GFX9-NEXT: .LBB21_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB21_129 +; GFX9-NEXT: .LBB21_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; 
GFX9-NEXT: s_branch .LBB21_132 +; GFX9-NEXT: .LBB21_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB21_135 +; GFX9-NEXT: .LBB21_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB21_138 +; GFX9-NEXT: .LBB21_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB21_141 +; GFX9-NEXT: .LBB21_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB21_144 +; GFX9-NEXT: .LBB21_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB21_147 +; GFX9-NEXT: .LBB21_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB21_150 +; GFX9-NEXT: .LBB21_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB21_153 +; GFX9-NEXT: .LBB21_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_153: +; GFX9-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB21_156 +; GFX9-NEXT: .LBB21_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB21_159 +; GFX9-NEXT: .LBB21_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB21_162 +; GFX9-NEXT: .LBB21_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_164 +; 
GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB21_165 +; GFX9-NEXT: .LBB21_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB21_168 +; GFX9-NEXT: .LBB21_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB21_171 +; GFX9-NEXT: .LBB21_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB21_174 +; GFX9-NEXT: .LBB21_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 
0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB21_177 +; GFX9-NEXT: .LBB21_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB21_180 +; GFX9-NEXT: .LBB21_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB21_183 +; GFX9-NEXT: .LBB21_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB21_186 +; GFX9-NEXT: .LBB21_185: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB21_189 +; GFX9-NEXT: .LBB21_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_max_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB21_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB21_192 +; GFX9-NEXT: .LBB21_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB21_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB21_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_cbranch_execz .LBB21_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_max_u32 s4, s6, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_max_rtn_u32 v0, v3, v0 +; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB21_2: +; GFX9-NEXT: .LBB21_194: ; GFX9-NEXT: s_or_b64 exec, 
exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_max_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_max_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1064-NEXT: s_branch .LBB21_3 +; GFX1064-NEXT: .LBB21_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB21_6 +; GFX1064-NEXT: .LBB21_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB21_9 +; GFX1064-NEXT: .LBB21_8: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: 
s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB21_12 +; GFX1064-NEXT: .LBB21_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB21_15 +; GFX1064-NEXT: .LBB21_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB21_18 +; GFX1064-NEXT: .LBB21_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB21_21 +; GFX1064-NEXT: .LBB21_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB21_24 +; GFX1064-NEXT: .LBB21_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB21_27 +; GFX1064-NEXT: .LBB21_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB21_30 +; GFX1064-NEXT: .LBB21_29: +; GFX1064-NEXT: ; 
implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB21_33 +; GFX1064-NEXT: .LBB21_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB21_36 +; GFX1064-NEXT: .LBB21_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB21_39 +; GFX1064-NEXT: .LBB21_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 
s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB21_42 +; GFX1064-NEXT: .LBB21_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB21_45 +; GFX1064-NEXT: .LBB21_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB21_48 +; GFX1064-NEXT: .LBB21_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB21_51 +; GFX1064-NEXT: .LBB21_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB21_54 +; GFX1064-NEXT: .LBB21_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB21_57 +; GFX1064-NEXT: .LBB21_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; 
GFX1064-NEXT: s_branch .LBB21_60 +; GFX1064-NEXT: .LBB21_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB21_63 +; GFX1064-NEXT: .LBB21_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB21_66 +; GFX1064-NEXT: .LBB21_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_68 +; GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB21_69 +; GFX1064-NEXT: .LBB21_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_69: +; GFX1064-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB21_72 +; GFX1064-NEXT: .LBB21_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB21_75 +; GFX1064-NEXT: .LBB21_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB21_78 +; GFX1064-NEXT: .LBB21_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 
0x4000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB21_81 +; GFX1064-NEXT: .LBB21_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB21_84 +; GFX1064-NEXT: .LBB21_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_max_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB21_87 +; GFX1064-NEXT: .LBB21_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX1064-NEXT: s_cbranch_scc1 .LBB21_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB21_90 +; GFX1064-NEXT: .LBB21_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_max_u32 s4, s6, s2 +; GFX1064-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB21_93 +; GFX1064-NEXT: .LBB21_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_max_u32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB21_96 +; GFX1064-NEXT: .LBB21_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s3, s5, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_max_u32 s6, s4, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB21_99 +; GFX1064-NEXT: .LBB21_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; 
GFX1064-NEXT: .LBB21_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB21_102 +; GFX1064-NEXT: .LBB21_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB21_105 +; GFX1064-NEXT: .LBB21_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB21_108 +; GFX1064-NEXT: .LBB21_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; 
GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB21_111 +; GFX1064-NEXT: .LBB21_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB21_114 +; GFX1064-NEXT: .LBB21_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB21_117 +; GFX1064-NEXT: .LBB21_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB21_120 +; GFX1064-NEXT: .LBB21_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB21_123 +; GFX1064-NEXT: .LBB21_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB21_126 +; GFX1064-NEXT: .LBB21_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch 
.LBB21_129 +; GFX1064-NEXT: .LBB21_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB21_132 +; GFX1064-NEXT: .LBB21_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB21_135 +; GFX1064-NEXT: .LBB21_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_137 +; GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB21_138 +; GFX1064-NEXT: .LBB21_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: 
s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB21_141 +; GFX1064-NEXT: .LBB21_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB21_144 +; GFX1064-NEXT: .LBB21_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB21_147 +; GFX1064-NEXT: .LBB21_146: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, 
v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB21_150 +; GFX1064-NEXT: .LBB21_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB21_153 +; GFX1064-NEXT: .LBB21_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB21_156 +; GFX1064-NEXT: .LBB21_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX1064-NEXT: s_cbranch_scc1 .LBB21_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB21_159 +; GFX1064-NEXT: .LBB21_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB21_162 +; GFX1064-NEXT: .LBB21_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB21_165 +; GFX1064-NEXT: .LBB21_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB21_168 +; GFX1064-NEXT: 
.LBB21_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB21_171 +; GFX1064-NEXT: .LBB21_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB21_174 +; GFX1064-NEXT: .LBB21_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB21_177 +; GFX1064-NEXT: .LBB21_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB21_180 +; GFX1064-NEXT: .LBB21_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB21_183 +; GFX1064-NEXT: .LBB21_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_max_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB21_186 +; GFX1064-NEXT: .LBB21_185: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 
+; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB21_189 +; GFX1064-NEXT: .LBB21_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_max_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB21_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB21_192 +; GFX1064-NEXT: .LBB21_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB21_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB21_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB21_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-NEXT: s_max_u32 s4, s6, s4 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; 
GFX1064-NEXT: .LBB21_2: +; GFX1064-NEXT: .LBB21_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, 
exec_lo, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1032-NEXT: s_branch .LBB21_3 +; GFX1032-NEXT: .LBB21_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB21_6 +; GFX1032-NEXT: .LBB21_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB21_9 +; GFX1032-NEXT: .LBB21_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB21_12 +; 
GFX1032-NEXT: .LBB21_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB21_15 +; GFX1032-NEXT: .LBB21_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB21_18 +; GFX1032-NEXT: .LBB21_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB21_21 +; GFX1032-NEXT: .LBB21_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: 
v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB21_24 +; GFX1032-NEXT: .LBB21_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB21_27 +; GFX1032-NEXT: .LBB21_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB21_30 +; GFX1032-NEXT: .LBB21_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: 
s_branch .LBB21_33 +; GFX1032-NEXT: .LBB21_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB21_36 +; GFX1032-NEXT: .LBB21_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB21_39 +; GFX1032-NEXT: .LBB21_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB21_42 +; GFX1032-NEXT: .LBB21_41: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: 
s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB21_45 +; GFX1032-NEXT: .LBB21_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB21_48 +; GFX1032-NEXT: .LBB21_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB21_51 +; GFX1032-NEXT: .LBB21_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_53 +; GFX1032-NEXT: ; %bb.52: +; 
GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB21_54 +; GFX1032-NEXT: .LBB21_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB21_57 +; GFX1032-NEXT: .LBB21_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB21_60 +; GFX1032-NEXT: .LBB21_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1032-NEXT: s_branch .LBB21_63 +; GFX1032-NEXT: .LBB21_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB21_66 +; GFX1032-NEXT: .LBB21_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB21_69 +; GFX1032-NEXT: .LBB21_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB21_72 +; GFX1032-NEXT: .LBB21_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX1032-NEXT: s_cbranch_scc1 .LBB21_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB21_75 +; GFX1032-NEXT: .LBB21_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB21_78 +; GFX1032-NEXT: .LBB21_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB21_81 +; GFX1032-NEXT: .LBB21_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_83 +; GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB21_84 +; GFX1032-NEXT: .LBB21_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_84: +; GFX1032-NEXT: 
s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB21_87 +; GFX1032-NEXT: .LBB21_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB21_90 +; GFX1032-NEXT: .LBB21_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB21_93 +; GFX1032-NEXT: .LBB21_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; 
GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB21_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB21_96 +; GFX1032-NEXT: .LBB21_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB21_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB21_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB21_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-NEXT: s_max_u32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB21_2: +; GFX1032-NEXT: .LBB21_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; 
GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_mov_b32_e32 v3, 0 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; 
GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: v_mov_b32_e32 v4, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1164-NEXT: s_branch .LBB21_3 +; GFX1164-NEXT: .LBB21_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB21_6 +; GFX1164-NEXT: .LBB21_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1164-NEXT: s_branch .LBB21_9 +; GFX1164-NEXT: .LBB21_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB21_12 +; GFX1164-NEXT: .LBB21_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB21_15 +; GFX1164-NEXT: .LBB21_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB21_18 +; GFX1164-NEXT: .LBB21_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 
exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB21_21 +; GFX1164-NEXT: .LBB21_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB21_24 +; GFX1164-NEXT: .LBB21_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB21_27 +; GFX1164-NEXT: .LBB21_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 
+; GFX1164-NEXT: s_branch .LBB21_30 +; GFX1164-NEXT: .LBB21_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB21_33 +; GFX1164-NEXT: .LBB21_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB21_36 +; GFX1164-NEXT: .LBB21_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_38 +; GFX1164-NEXT: ; %bb.37: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB21_39 +; GFX1164-NEXT: .LBB21_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_39: +; GFX1164-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB21_42 +; GFX1164-NEXT: .LBB21_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB21_45 +; GFX1164-NEXT: .LBB21_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB21_48 +; GFX1164-NEXT: .LBB21_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_48: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; 
GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB21_51 +; GFX1164-NEXT: .LBB21_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB21_54 +; GFX1164-NEXT: .LBB21_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB21_57 +; GFX1164-NEXT: .LBB21_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 
.LBB21_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB21_60 +; GFX1164-NEXT: .LBB21_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB21_63 +; GFX1164-NEXT: .LBB21_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB21_66 +; GFX1164-NEXT: .LBB21_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB21_69 +; GFX1164-NEXT: .LBB21_68: +; 
GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB21_72 +; GFX1164-NEXT: .LBB21_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB21_75 +; GFX1164-NEXT: .LBB21_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB21_78 +; GFX1164-NEXT: .LBB21_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 
s7, v0, 26 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB21_81 +; GFX1164-NEXT: .LBB21_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB21_84 +; GFX1164-NEXT: .LBB21_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_max_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB21_87 +; GFX1164-NEXT: .LBB21_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB21_90 +; GFX1164-NEXT: .LBB21_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_max_u32 s4, s6, s2 +; GFX1164-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB21_93 +; GFX1164-NEXT: .LBB21_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, 0 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_max_u32 s4, s4, s2 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB21_96 +; GFX1164-NEXT: .LBB21_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s3, s5, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_max_u32 s6, s4, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: 
s_branch .LBB21_99 +; GFX1164-NEXT: .LBB21_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB21_102 +; GFX1164-NEXT: .LBB21_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB21_105 +; GFX1164-NEXT: .LBB21_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1164-NEXT: s_branch .LBB21_108 +; GFX1164-NEXT: .LBB21_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, 
s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB21_111 +; GFX1164-NEXT: .LBB21_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB21_114 +; GFX1164-NEXT: .LBB21_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB21_117 +; GFX1164-NEXT: .LBB21_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_117: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 
exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB21_120 +; GFX1164-NEXT: .LBB21_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB21_123 +; GFX1164-NEXT: .LBB21_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB21_126 +; GFX1164-NEXT: .LBB21_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_128 +; 
GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB21_129 +; GFX1164-NEXT: .LBB21_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB21_132 +; GFX1164-NEXT: .LBB21_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB21_135 +; GFX1164-NEXT: .LBB21_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB21_138 +; GFX1164-NEXT: .LBB21_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB21_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB21_141 +; GFX1164-NEXT: .LBB21_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB21_144 +; GFX1164-NEXT: .LBB21_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: s_branch .LBB21_147 +; GFX1164-NEXT: .LBB21_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, 
exec_hi, 0x20000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB21_150 +; GFX1164-NEXT: .LBB21_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB21_153 +; GFX1164-NEXT: .LBB21_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB21_156 +; GFX1164-NEXT: .LBB21_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB21_159 +; GFX1164-NEXT: .LBB21_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB21_162 +; GFX1164-NEXT: .LBB21_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB21_165 +; GFX1164-NEXT: .LBB21_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: 
v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB21_168 +; GFX1164-NEXT: .LBB21_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB21_171 +; GFX1164-NEXT: .LBB21_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB21_174 +; GFX1164-NEXT: .LBB21_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB21_177 +; GFX1164-NEXT: .LBB21_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; 
GFX1164-NEXT: .LBB21_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB21_180 +; GFX1164-NEXT: .LBB21_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB21_183 +; GFX1164-NEXT: .LBB21_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_max_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1164-NEXT: s_branch .LBB21_186 +; GFX1164-NEXT: .LBB21_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; 
GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB21_189 +; GFX1164-NEXT: .LBB21_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, 0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_max_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB21_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB21_192 +; GFX1164-NEXT: .LBB21_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB21_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB21_2 -; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: v_mov_b32_e32 v0, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB21_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, 0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-NEXT: s_max_u32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1164-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB21_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB21_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -5858,53 +45675,510 @@ ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: v_max_u32_dpp v1, v1, v1 
row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, 0, 0 +; GFX1132-NEXT: s_branch .LBB21_3 +; GFX1132-NEXT: .LBB21_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB21_6 +; GFX1132-NEXT: .LBB21_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: 
s_cbranch_scc1 .LBB21_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB21_9 +; GFX1132-NEXT: .LBB21_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB21_12 +; GFX1132-NEXT: .LBB21_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB21_15 +; GFX1132-NEXT: .LBB21_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB21_18 +; GFX1132-NEXT: .LBB21_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB21_21 +; GFX1132-NEXT: .LBB21_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB21_24 +; GFX1132-NEXT: .LBB21_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB21_27 +; GFX1132-NEXT: .LBB21_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX1132-NEXT: s_cbranch_scc1 .LBB21_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB21_30 +; GFX1132-NEXT: .LBB21_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB21_33 +; GFX1132-NEXT: .LBB21_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB21_36 +; GFX1132-NEXT: .LBB21_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_38 +; GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB21_39 +; GFX1132-NEXT: .LBB21_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_39: +; GFX1132-NEXT: s_and_b32 
s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB21_42 +; GFX1132-NEXT: .LBB21_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB21_45 +; GFX1132-NEXT: .LBB21_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB21_48 +; GFX1132-NEXT: .LBB21_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 
0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB21_51 +; GFX1132-NEXT: .LBB21_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB21_54 +; GFX1132-NEXT: .LBB21_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB21_57 +; GFX1132-NEXT: .LBB21_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB21_60 +; GFX1132-NEXT: .LBB21_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; 
GFX1132-NEXT: .LBB21_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB21_63 +; GFX1132-NEXT: .LBB21_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB21_66 +; GFX1132-NEXT: .LBB21_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB21_69 +; GFX1132-NEXT: .LBB21_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: 
s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB21_72 +; GFX1132-NEXT: .LBB21_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB21_75 +; GFX1132-NEXT: .LBB21_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB21_78 +; GFX1132-NEXT: .LBB21_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB21_81 +; 
GFX1132-NEXT: .LBB21_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB21_84 +; GFX1132-NEXT: .LBB21_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB21_87 +; GFX1132-NEXT: .LBB21_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB21_90 +; GFX1132-NEXT: .LBB21_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_max_u32 s2, s2, 
s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB21_93 +; GFX1132-NEXT: .LBB21_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB21_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB21_96 +; GFX1132-NEXT: .LBB21_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB21_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB21_2 -; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: v_mov_b32_e32 v0, s4 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB21_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, 0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132-NEXT: s_max_u32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_max_rtn_u32 v0, v4, v0 +; GFX1132-NEXT: ds_max_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; 
GFX1132-NEXT: .LBB21_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB21_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_max_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6180,273 +46454,4557 @@ ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, -1 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, -1 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; 
GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, -1, 0 +; GFX8-NEXT: s_branch .LBB23_3 +; GFX8-NEXT: .LBB23_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, -1 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB23_6 +; GFX8-NEXT: .LBB23_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB23_9 +; GFX8-NEXT: .LBB23_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB23_12 +; GFX8-NEXT: .LBB23_11: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB23_15 +; GFX8-NEXT: .LBB23_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB23_18 +; GFX8-NEXT: .LBB23_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB23_21 +; GFX8-NEXT: .LBB23_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB23_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB23_24 +; GFX8-NEXT: .LBB23_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB23_27 +; GFX8-NEXT: .LBB23_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB23_30 +; GFX8-NEXT: .LBB23_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB23_33 +; GFX8-NEXT: .LBB23_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 
+; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB23_36 +; GFX8-NEXT: .LBB23_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB23_39 +; GFX8-NEXT: .LBB23_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB23_42 +; GFX8-NEXT: .LBB23_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB23_45 +; GFX8-NEXT: .LBB23_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; 
GFX8-NEXT: .LBB23_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB23_48 +; GFX8-NEXT: .LBB23_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB23_51 +; GFX8-NEXT: .LBB23_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB23_54 +; GFX8-NEXT: .LBB23_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB23_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB23_57 +; GFX8-NEXT: .LBB23_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB23_60 +; GFX8-NEXT: .LBB23_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB23_63 +; GFX8-NEXT: .LBB23_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB23_66 +; GFX8-NEXT: .LBB23_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, 
exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB23_69 +; GFX8-NEXT: .LBB23_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB23_72 +; GFX8-NEXT: .LBB23_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB23_75 +; GFX8-NEXT: .LBB23_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB23_78 +; GFX8-NEXT: .LBB23_77: +; GFX8-NEXT: 
; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB23_81 +; GFX8-NEXT: .LBB23_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB23_84 +; GFX8-NEXT: .LBB23_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB23_87 +; GFX8-NEXT: .LBB23_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: 
v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB23_90 +; GFX8-NEXT: .LBB23_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB23_93 +; GFX8-NEXT: .LBB23_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, -1 +; GFX8-NEXT: s_min_u32 s4, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s5, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s4, 31 +; GFX8-NEXT: s_branch .LBB23_96 +; GFX8-NEXT: .LBB23_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_96: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, -1 +; GFX8-NEXT: s_min_u32 s6, s4, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB23_99 +; GFX8-NEXT: .LBB23_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: 
s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB23_102 +; GFX8-NEXT: .LBB23_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB23_105 +; GFX8-NEXT: .LBB23_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB23_108 +; GFX8-NEXT: .LBB23_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB23_111 +; GFX8-NEXT: .LBB23_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB23_114 +; GFX8-NEXT: .LBB23_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB23_117 +; GFX8-NEXT: .LBB23_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB23_120 +; GFX8-NEXT: .LBB23_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: 
s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB23_123 +; GFX8-NEXT: .LBB23_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB23_126 +; GFX8-NEXT: .LBB23_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB23_129 +; GFX8-NEXT: .LBB23_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB23_132 +; 
GFX8-NEXT: .LBB23_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB23_135 +; GFX8-NEXT: .LBB23_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB23_138 +; GFX8-NEXT: .LBB23_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB23_141 +; GFX8-NEXT: .LBB23_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 
15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB23_144 +; GFX8-NEXT: .LBB23_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB23_147 +; GFX8-NEXT: .LBB23_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB23_150 +; GFX8-NEXT: .LBB23_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB23_153 +; GFX8-NEXT: 
.LBB23_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB23_156 +; GFX8-NEXT: .LBB23_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB23_159 +; GFX8-NEXT: .LBB23_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB23_162 +; GFX8-NEXT: .LBB23_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB23_165 +; GFX8-NEXT: .LBB23_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB23_168 +; GFX8-NEXT: .LBB23_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB23_171 +; GFX8-NEXT: .LBB23_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB23_174 +; GFX8-NEXT: .LBB23_173: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB23_177 +; GFX8-NEXT: .LBB23_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB23_180 +; GFX8-NEXT: .LBB23_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB23_183 +; GFX8-NEXT: .LBB23_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB23_186 +; GFX8-NEXT: .LBB23_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB23_189 +; GFX8-NEXT: .LBB23_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, -1 +; GFX8-NEXT: s_min_u32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB23_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB23_192 +; GFX8-NEXT: .LBB23_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB23_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB23_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB23_194 +; GFX8-NEXT: ; %bb.193: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, -1 +; GFX8-NEXT: s_min_u32 s4, s6, s4 ; GFX8-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: .LBB23_2: +; GFX8-NEXT: .LBB23_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX8-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, -1 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, -1 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_and_b32 
s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_writelane_b32 v1, -1, 0 +; GFX9-NEXT: s_branch .LBB23_3 +; GFX9-NEXT: .LBB23_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, -1 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB23_6 +; GFX9-NEXT: .LBB23_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB23_9 +; GFX9-NEXT: .LBB23_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB23_12 +; GFX9-NEXT: .LBB23_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; 
GFX9-NEXT: .LBB23_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB23_15 +; GFX9-NEXT: .LBB23_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB23_18 +; GFX9-NEXT: .LBB23_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB23_21 +; GFX9-NEXT: .LBB23_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_23 +; 
GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB23_24 +; GFX9-NEXT: .LBB23_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB23_27 +; GFX9-NEXT: .LBB23_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB23_30 +; GFX9-NEXT: .LBB23_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB23_33 +; GFX9-NEXT: .LBB23_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 
exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB23_36 +; GFX9-NEXT: .LBB23_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB23_39 +; GFX9-NEXT: .LBB23_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB23_42 +; GFX9-NEXT: .LBB23_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB23_45 +; GFX9-NEXT: .LBB23_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_45: +; 
GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB23_48 +; GFX9-NEXT: .LBB23_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB23_51 +; GFX9-NEXT: .LBB23_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB23_54 +; GFX9-NEXT: .LBB23_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_56 
+; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB23_57 +; GFX9-NEXT: .LBB23_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB23_60 +; GFX9-NEXT: .LBB23_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB23_63 +; GFX9-NEXT: .LBB23_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB23_66 +; GFX9-NEXT: .LBB23_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; 
GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB23_69 +; GFX9-NEXT: .LBB23_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB23_72 +; GFX9-NEXT: .LBB23_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB23_75 +; GFX9-NEXT: .LBB23_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB23_78 +; GFX9-NEXT: .LBB23_77: +; GFX9-NEXT: ; implicit-def: 
$vgpr1 +; GFX9-NEXT: .LBB23_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB23_81 +; GFX9-NEXT: .LBB23_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB23_84 +; GFX9-NEXT: .LBB23_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB23_87 +; GFX9-NEXT: .LBB23_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, 
v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB23_90 +; GFX9-NEXT: .LBB23_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB23_93 +; GFX9-NEXT: .LBB23_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, -1 +; GFX9-NEXT: s_min_u32 s4, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s5, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s4, 31 +; GFX9-NEXT: s_branch .LBB23_96 +; GFX9-NEXT: .LBB23_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_96: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, -1 +; GFX9-NEXT: s_min_u32 s6, s4, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB23_99 +; GFX9-NEXT: .LBB23_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; 
GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB23_102 +; GFX9-NEXT: .LBB23_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB23_105 +; GFX9-NEXT: .LBB23_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB23_108 +; GFX9-NEXT: .LBB23_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; 
GFX9-NEXT: s_branch .LBB23_111 +; GFX9-NEXT: .LBB23_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB23_114 +; GFX9-NEXT: .LBB23_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB23_117 +; GFX9-NEXT: .LBB23_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB23_120 +; GFX9-NEXT: .LBB23_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB23_123 +; GFX9-NEXT: .LBB23_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB23_126 +; GFX9-NEXT: .LBB23_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB23_129 +; GFX9-NEXT: .LBB23_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB23_132 +; GFX9-NEXT: .LBB23_131: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB23_135 +; GFX9-NEXT: .LBB23_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB23_138 +; GFX9-NEXT: .LBB23_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB23_141 +; GFX9-NEXT: .LBB23_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB23_144 +; GFX9-NEXT: .LBB23_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB23_147 +; GFX9-NEXT: .LBB23_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB23_150 +; GFX9-NEXT: .LBB23_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB23_153 +; GFX9-NEXT: .LBB23_152: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB23_156 +; GFX9-NEXT: .LBB23_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB23_159 +; GFX9-NEXT: .LBB23_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB23_162 +; GFX9-NEXT: .LBB23_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB23_165 +; GFX9-NEXT: .LBB23_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB23_168 +; GFX9-NEXT: .LBB23_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB23_171 +; GFX9-NEXT: .LBB23_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB23_174 +; GFX9-NEXT: .LBB23_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 
+; GFX9-NEXT: .LBB23_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB23_177 +; GFX9-NEXT: .LBB23_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB23_180 +; GFX9-NEXT: .LBB23_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB23_183 +; GFX9-NEXT: .LBB23_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 
+; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB23_186 +; GFX9-NEXT: .LBB23_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB23_189 +; GFX9-NEXT: .LBB23_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_min_u32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB23_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB23_192 +; GFX9-NEXT: .LBB23_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB23_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB23_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB23_194 +; GFX9-NEXT: ; %bb.193: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, -1 +; GFX9-NEXT: s_min_u32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: 
v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 +; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: .LBB23_2: +; GFX9-NEXT: .LBB23_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_min_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_min_u32_e32 v0, s4, v1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1064-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_2 +; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, -1 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1064-NEXT: s_branch .LBB23_3 +; GFX1064-NEXT: .LBB23_2: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_3: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s6, s2, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_5 +; GFX1064-NEXT: ; %bb.4: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1064-NEXT: s_branch .LBB23_6 +; GFX1064-NEXT: .LBB23_5: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_6: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_8 +; GFX1064-NEXT: ; %bb.7: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 2 +; GFX1064-NEXT: s_branch .LBB23_9 +; GFX1064-NEXT: .LBB23_8: +; GFX1064-NEXT: ; 
implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_9: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_11 +; GFX1064-NEXT: ; %bb.10: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1064-NEXT: s_branch .LBB23_12 +; GFX1064-NEXT: .LBB23_11: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_12: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_14 +; GFX1064-NEXT: ; %bb.13: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1064-NEXT: s_branch .LBB23_15 +; GFX1064-NEXT: .LBB23_14: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_15: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_17 +; GFX1064-NEXT: ; %bb.16: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1064-NEXT: s_branch .LBB23_18 +; GFX1064-NEXT: .LBB23_17: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_18: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1064-NEXT: s_min_u32 s6, s6, 
s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_20 +; GFX1064-NEXT: ; %bb.19: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1064-NEXT: s_branch .LBB23_21 +; GFX1064-NEXT: .LBB23_20: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_21: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_23 +; GFX1064-NEXT: ; %bb.22: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1064-NEXT: s_branch .LBB23_24 +; GFX1064-NEXT: .LBB23_23: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_24: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_26 +; GFX1064-NEXT: ; %bb.25: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1064-NEXT: s_branch .LBB23_27 +; GFX1064-NEXT: .LBB23_26: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_27: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 
+; GFX1064-NEXT: s_cbranch_scc1 .LBB23_29 +; GFX1064-NEXT: ; %bb.28: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1064-NEXT: s_branch .LBB23_30 +; GFX1064-NEXT: .LBB23_29: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_30: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_32 +; GFX1064-NEXT: ; %bb.31: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1064-NEXT: s_branch .LBB23_33 +; GFX1064-NEXT: .LBB23_32: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_33: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_35 +; GFX1064-NEXT: ; %bb.34: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1064-NEXT: s_branch .LBB23_36 +; GFX1064-NEXT: .LBB23_35: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_36: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_38 +; GFX1064-NEXT: ; %bb.37: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1064-NEXT: s_branch .LBB23_39 +; 
GFX1064-NEXT: .LBB23_38: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_39: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_41 +; GFX1064-NEXT: ; %bb.40: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1064-NEXT: s_branch .LBB23_42 +; GFX1064-NEXT: .LBB23_41: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_42: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_44 +; GFX1064-NEXT: ; %bb.43: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1064-NEXT: s_branch .LBB23_45 +; GFX1064-NEXT: .LBB23_44: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_45: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_47 +; GFX1064-NEXT: ; %bb.46: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1064-NEXT: s_branch .LBB23_48 +; GFX1064-NEXT: .LBB23_47: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_48: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; 
GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_50 +; GFX1064-NEXT: ; %bb.49: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1064-NEXT: s_branch .LBB23_51 +; GFX1064-NEXT: .LBB23_50: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_51: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_53 +; GFX1064-NEXT: ; %bb.52: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1064-NEXT: s_branch .LBB23_54 +; GFX1064-NEXT: .LBB23_53: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_54: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_56 +; GFX1064-NEXT: ; %bb.55: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1064-NEXT: s_branch .LBB23_57 +; GFX1064-NEXT: .LBB23_56: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_57: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1064-NEXT: s_bitcmp1_b32 
exec_lo, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_59 +; GFX1064-NEXT: ; %bb.58: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1064-NEXT: s_branch .LBB23_60 +; GFX1064-NEXT: .LBB23_59: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_60: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_62 +; GFX1064-NEXT: ; %bb.61: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1064-NEXT: s_branch .LBB23_63 +; GFX1064-NEXT: .LBB23_62: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_63: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_65 +; GFX1064-NEXT: ; %bb.64: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1064-NEXT: s_branch .LBB23_66 +; GFX1064-NEXT: .LBB23_65: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_66: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_68 +; 
GFX1064-NEXT: ; %bb.67: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1064-NEXT: s_branch .LBB23_69 +; GFX1064-NEXT: .LBB23_68: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_69: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_71 +; GFX1064-NEXT: ; %bb.70: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1064-NEXT: s_branch .LBB23_72 +; GFX1064-NEXT: .LBB23_71: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_72: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_74 +; GFX1064-NEXT: ; %bb.73: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1064-NEXT: s_branch .LBB23_75 +; GFX1064-NEXT: .LBB23_74: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_75: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_77 +; GFX1064-NEXT: ; %bb.76: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1064-NEXT: s_branch .LBB23_78 +; GFX1064-NEXT: .LBB23_77: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; 
GFX1064-NEXT: .LBB23_78: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_80 +; GFX1064-NEXT: ; %bb.79: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1064-NEXT: s_branch .LBB23_81 +; GFX1064-NEXT: .LBB23_80: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_81: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_83 +; GFX1064-NEXT: ; %bb.82: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1064-NEXT: s_branch .LBB23_84 +; GFX1064-NEXT: .LBB23_83: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_84: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1064-NEXT: s_mov_b32 s3, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_86 +; GFX1064-NEXT: ; %bb.85: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1064-NEXT: s_branch .LBB23_87 +; GFX1064-NEXT: .LBB23_86: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_87: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s2, 
exec_lo, 0x20000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_89 +; GFX1064-NEXT: ; %bb.88: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1064-NEXT: s_branch .LBB23_90 +; GFX1064-NEXT: .LBB23_89: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_90: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1064-NEXT: s_min_u32 s4, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1064-NEXT: s_mov_b32 s9, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_92 +; GFX1064-NEXT: ; %bb.91: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1064-NEXT: s_branch .LBB23_93 +; GFX1064-NEXT: .LBB23_92: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_93: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1064-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1064-NEXT: s_min_u32 s4, s4, s2 +; GFX1064-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1064-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1064-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_95 +; GFX1064-NEXT: ; %bb.94: +; GFX1064-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1064-NEXT: s_branch .LBB23_96 +; GFX1064-NEXT: .LBB23_95: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_96: +; GFX1064-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1064-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1064-NEXT: s_min_u32 s6, s4, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_98 +; GFX1064-NEXT: ; %bb.97: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1064-NEXT: s_branch .LBB23_99 +; GFX1064-NEXT: .LBB23_98: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_99: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_101 +; GFX1064-NEXT: ; %bb.100: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1064-NEXT: s_branch .LBB23_102 +; GFX1064-NEXT: .LBB23_101: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_102: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_104 +; GFX1064-NEXT: ; %bb.103: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1064-NEXT: s_branch .LBB23_105 +; GFX1064-NEXT: .LBB23_104: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_105: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_107 +; GFX1064-NEXT: ; %bb.106: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 35 +; GFX1064-NEXT: s_branch .LBB23_108 +; GFX1064-NEXT: 
.LBB23_107: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_108: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_110 +; GFX1064-NEXT: ; %bb.109: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1064-NEXT: s_branch .LBB23_111 +; GFX1064-NEXT: .LBB23_110: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_111: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_113 +; GFX1064-NEXT: ; %bb.112: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1064-NEXT: s_branch .LBB23_114 +; GFX1064-NEXT: .LBB23_113: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_114: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_116 +; GFX1064-NEXT: ; %bb.115: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1064-NEXT: s_branch .LBB23_117 +; GFX1064-NEXT: .LBB23_116: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_117: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 
s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_119 +; GFX1064-NEXT: ; %bb.118: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1064-NEXT: s_branch .LBB23_120 +; GFX1064-NEXT: .LBB23_119: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_120: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_122 +; GFX1064-NEXT: ; %bb.121: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1064-NEXT: s_branch .LBB23_123 +; GFX1064-NEXT: .LBB23_122: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_123: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_125 +; GFX1064-NEXT: ; %bb.124: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1064-NEXT: s_branch .LBB23_126 +; GFX1064-NEXT: .LBB23_125: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_126: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 42 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; 
GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_128 +; GFX1064-NEXT: ; %bb.127: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1064-NEXT: s_branch .LBB23_129 +; GFX1064-NEXT: .LBB23_128: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_129: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_131 +; GFX1064-NEXT: ; %bb.130: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1064-NEXT: s_branch .LBB23_132 +; GFX1064-NEXT: .LBB23_131: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_132: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_134 +; GFX1064-NEXT: ; %bb.133: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1064-NEXT: s_branch .LBB23_135 +; GFX1064-NEXT: .LBB23_134: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_135: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_137 +; 
GFX1064-NEXT: ; %bb.136: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1064-NEXT: s_branch .LBB23_138 +; GFX1064-NEXT: .LBB23_137: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_138: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_140 +; GFX1064-NEXT: ; %bb.139: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1064-NEXT: s_branch .LBB23_141 +; GFX1064-NEXT: .LBB23_140: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_141: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_143 +; GFX1064-NEXT: ; %bb.142: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1064-NEXT: s_branch .LBB23_144 +; GFX1064-NEXT: .LBB23_143: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_144: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_146 +; GFX1064-NEXT: ; %bb.145: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: s_branch .LBB23_147 +; GFX1064-NEXT: .LBB23_146: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_147: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_149 +; GFX1064-NEXT: ; %bb.148: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1064-NEXT: s_branch .LBB23_150 +; GFX1064-NEXT: .LBB23_149: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_150: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_152 +; GFX1064-NEXT: ; %bb.151: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1064-NEXT: s_branch .LBB23_153 +; GFX1064-NEXT: .LBB23_152: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_153: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_155 +; GFX1064-NEXT: ; %bb.154: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1064-NEXT: s_branch .LBB23_156 +; GFX1064-NEXT: .LBB23_155: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_156: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: 
s_and_b32 s3, exec_hi, 0x100000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_158 +; GFX1064-NEXT: ; %bb.157: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1064-NEXT: s_branch .LBB23_159 +; GFX1064-NEXT: .LBB23_158: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_159: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_161 +; GFX1064-NEXT: ; %bb.160: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1064-NEXT: s_branch .LBB23_162 +; GFX1064-NEXT: .LBB23_161: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_162: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_164 +; GFX1064-NEXT: ; %bb.163: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1064-NEXT: s_branch .LBB23_165 +; GFX1064-NEXT: .LBB23_164: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_165: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1064-NEXT: 
s_bitcmp1_b32 exec_hi, 23 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_167 +; GFX1064-NEXT: ; %bb.166: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1064-NEXT: s_branch .LBB23_168 +; GFX1064-NEXT: .LBB23_167: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_168: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_170 +; GFX1064-NEXT: ; %bb.169: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1064-NEXT: s_branch .LBB23_171 +; GFX1064-NEXT: .LBB23_170: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_171: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_173 +; GFX1064-NEXT: ; %bb.172: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1064-NEXT: s_branch .LBB23_174 +; GFX1064-NEXT: .LBB23_173: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_174: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: 
s_cbranch_scc1 .LBB23_176 +; GFX1064-NEXT: ; %bb.175: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1064-NEXT: s_branch .LBB23_177 +; GFX1064-NEXT: .LBB23_176: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_177: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_179 +; GFX1064-NEXT: ; %bb.178: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1064-NEXT: s_branch .LBB23_180 +; GFX1064-NEXT: .LBB23_179: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_180: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_182 +; GFX1064-NEXT: ; %bb.181: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1064-NEXT: s_branch .LBB23_183 +; GFX1064-NEXT: .LBB23_182: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_183: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1064-NEXT: s_min_u32 s6, s6, s4 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_185 +; GFX1064-NEXT: ; %bb.184: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 61 +; GFX1064-NEXT: s_branch .LBB23_186 +; GFX1064-NEXT: .LBB23_185: +; 
GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_186: +; GFX1064-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1064-NEXT: s_min_u32 s6, s6, s2 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1064-NEXT: s_mov_b32 s2, 0 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_188 +; GFX1064-NEXT: ; %bb.187: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1064-NEXT: s_branch .LBB23_189 +; GFX1064-NEXT: .LBB23_188: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_189: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1064-NEXT: s_min_u32 s6, s6, s3 +; GFX1064-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1064-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1064-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1064-NEXT: s_cbranch_scc1 .LBB23_191 +; GFX1064-NEXT: ; %bb.190: +; GFX1064-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1064-NEXT: s_branch .LBB23_192 +; GFX1064-NEXT: .LBB23_191: +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: .LBB23_192: +; GFX1064-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB23_2 -; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB23_194 +; GFX1064-NEXT: ; %bb.193: +; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1064-NEXT: s_cselect_b32 s4, s7, -1 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 +; GFX1064-NEXT: s_min_u32 s4, s6, s4 +; 
GFX1064-NEXT: v_mov_b32_e32 v2, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: .LBB23_2: +; GFX1064-NEXT: .LBB23_194: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1032-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1032-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_2 +; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, -1 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; 
GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1032-NEXT: s_branch .LBB23_3 +; GFX1032-NEXT: .LBB23_2: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_3: +; GFX1032-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1032-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_5 +; GFX1032-NEXT: ; %bb.4: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1032-NEXT: s_branch .LBB23_6 +; GFX1032-NEXT: .LBB23_5: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_6: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_8 +; GFX1032-NEXT: ; %bb.7: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1032-NEXT: s_branch .LBB23_9 +; GFX1032-NEXT: .LBB23_8: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_9: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1032-NEXT: s_cselect_b32 
s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_11 +; GFX1032-NEXT: ; %bb.10: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1032-NEXT: s_branch .LBB23_12 +; GFX1032-NEXT: .LBB23_11: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_12: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_14 +; GFX1032-NEXT: ; %bb.13: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1032-NEXT: s_branch .LBB23_15 +; GFX1032-NEXT: .LBB23_14: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_15: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_17 +; GFX1032-NEXT: ; %bb.16: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 5 +; GFX1032-NEXT: s_branch .LBB23_18 +; GFX1032-NEXT: .LBB23_17: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_18: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_20 +; GFX1032-NEXT: ; %bb.19: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1032-NEXT: s_branch .LBB23_21 +; GFX1032-NEXT: .LBB23_20: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: 
.LBB23_21: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_23 +; GFX1032-NEXT: ; %bb.22: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1032-NEXT: s_branch .LBB23_24 +; GFX1032-NEXT: .LBB23_23: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_24: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_26 +; GFX1032-NEXT: ; %bb.25: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1032-NEXT: s_branch .LBB23_27 +; GFX1032-NEXT: .LBB23_26: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_27: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_29 +; GFX1032-NEXT: ; %bb.28: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1032-NEXT: s_branch .LBB23_30 +; GFX1032-NEXT: .LBB23_29: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_30: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 10 +; 
GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_32 +; GFX1032-NEXT: ; %bb.31: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1032-NEXT: s_branch .LBB23_33 +; GFX1032-NEXT: .LBB23_32: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_33: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_35 +; GFX1032-NEXT: ; %bb.34: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1032-NEXT: s_branch .LBB23_36 +; GFX1032-NEXT: .LBB23_35: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_36: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_38 +; GFX1032-NEXT: ; %bb.37: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1032-NEXT: s_branch .LBB23_39 +; GFX1032-NEXT: .LBB23_38: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_39: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_41 +; GFX1032-NEXT: ; %bb.40: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1032-NEXT: s_branch .LBB23_42 +; GFX1032-NEXT: .LBB23_41: +; 
GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_42: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_44 +; GFX1032-NEXT: ; %bb.43: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1032-NEXT: s_branch .LBB23_45 +; GFX1032-NEXT: .LBB23_44: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_45: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_47 +; GFX1032-NEXT: ; %bb.46: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1032-NEXT: s_branch .LBB23_48 +; GFX1032-NEXT: .LBB23_47: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_48: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_50 +; GFX1032-NEXT: ; %bb.49: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1032-NEXT: s_branch .LBB23_51 +; GFX1032-NEXT: .LBB23_50: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_51: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: 
v_readlane_b32 s3, v0, 17 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_53 +; GFX1032-NEXT: ; %bb.52: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1032-NEXT: s_branch .LBB23_54 +; GFX1032-NEXT: .LBB23_53: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_54: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_56 +; GFX1032-NEXT: ; %bb.55: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1032-NEXT: s_branch .LBB23_57 +; GFX1032-NEXT: .LBB23_56: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_57: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_59 +; GFX1032-NEXT: ; %bb.58: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1032-NEXT: s_branch .LBB23_60 +; GFX1032-NEXT: .LBB23_59: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_60: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_62 +; GFX1032-NEXT: ; %bb.61: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 20 +; 
GFX1032-NEXT: s_branch .LBB23_63 +; GFX1032-NEXT: .LBB23_62: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_63: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_65 +; GFX1032-NEXT: ; %bb.64: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1032-NEXT: s_branch .LBB23_66 +; GFX1032-NEXT: .LBB23_65: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_66: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_68 +; GFX1032-NEXT: ; %bb.67: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1032-NEXT: s_branch .LBB23_69 +; GFX1032-NEXT: .LBB23_68: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_69: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_71 +; GFX1032-NEXT: ; %bb.70: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1032-NEXT: s_branch .LBB23_72 +; GFX1032-NEXT: .LBB23_71: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_72: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 
0x1000000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_74 +; GFX1032-NEXT: ; %bb.73: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1032-NEXT: s_branch .LBB23_75 +; GFX1032-NEXT: .LBB23_74: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_75: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_77 +; GFX1032-NEXT: ; %bb.76: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1032-NEXT: s_branch .LBB23_78 +; GFX1032-NEXT: .LBB23_77: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_78: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_80 +; GFX1032-NEXT: ; %bb.79: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1032-NEXT: s_branch .LBB23_81 +; GFX1032-NEXT: .LBB23_80: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_81: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_83 +; 
GFX1032-NEXT: ; %bb.82: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1032-NEXT: s_branch .LBB23_84 +; GFX1032-NEXT: .LBB23_83: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_84: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_86 +; GFX1032-NEXT: ; %bb.85: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1032-NEXT: s_branch .LBB23_87 +; GFX1032-NEXT: .LBB23_86: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_87: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_89 +; GFX1032-NEXT: ; %bb.88: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1032-NEXT: s_branch .LBB23_90 +; GFX1032-NEXT: .LBB23_89: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_90: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_92 +; GFX1032-NEXT: ; %bb.91: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1032-NEXT: s_branch .LBB23_93 +; GFX1032-NEXT: .LBB23_92: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_93: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: 
s_cselect_b32 s3, s3, -1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1032-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1032-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1032-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1032-NEXT: s_cbranch_scc1 .LBB23_95 +; GFX1032-NEXT: ; %bb.94: +; GFX1032-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1032-NEXT: s_branch .LBB23_96 +; GFX1032-NEXT: .LBB23_95: +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: .LBB23_96: +; GFX1032-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB23_2 -; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB23_98 +; GFX1032-NEXT: ; %bb.97: +; GFX1032-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1032-NEXT: s_cselect_b32 s3, s3, -1 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 +; GFX1032-NEXT: s_min_u32 s2, s2, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: .LBB23_2: +; GFX1032-NEXT: .LBB23_98: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry +; GFX1164-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX1164-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u32 s3, 0 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_2 +; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: v_mov_b32_e32 v1, v0 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: v_mov_b32_e32 v1, -1 -; GFX1164-NEXT: s_not_b64 exec, exec -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_mov_b32_e32 v3, -1 -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, v1 -; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mov_b32_e32 v2, s4 -; GFX1164-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1164-NEXT: v_readlane_b32 s4, v1, 15 -; 
GFX1164-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1164-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1164-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1164-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1164-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1164-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1164-NEXT: s_mov_b64 exec, s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX1164-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1164-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1164-NEXT: s_mov_b64 exec, s[4:5] -; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1164-NEXT: s_mov_b32 s2, -1 +; GFX1164-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1164-NEXT: s_branch .LBB23_3 +; GFX1164-NEXT: .LBB23_2: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_3: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s6, s2, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 1 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_5 +; GFX1164-NEXT: ; %bb.4: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 1 +; GFX1164-NEXT: s_branch .LBB23_6 +; GFX1164-NEXT: .LBB23_5: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_6: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 2 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_8 +; GFX1164-NEXT: ; %bb.7: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 2 +; 
GFX1164-NEXT: s_branch .LBB23_9 +; GFX1164-NEXT: .LBB23_8: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_9: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 3 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_11 +; GFX1164-NEXT: ; %bb.10: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 3 +; GFX1164-NEXT: s_branch .LBB23_12 +; GFX1164-NEXT: .LBB23_11: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_12: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 4 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_14 +; GFX1164-NEXT: ; %bb.13: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 4 +; GFX1164-NEXT: s_branch .LBB23_15 +; GFX1164-NEXT: .LBB23_14: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_15: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 5 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_17 +; GFX1164-NEXT: ; %bb.16: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 5 +; GFX1164-NEXT: s_branch .LBB23_18 +; GFX1164-NEXT: .LBB23_17: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_18: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 
+; GFX1164-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 6 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_20 +; GFX1164-NEXT: ; %bb.19: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 6 +; GFX1164-NEXT: s_branch .LBB23_21 +; GFX1164-NEXT: .LBB23_20: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_21: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 7 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_23 +; GFX1164-NEXT: ; %bb.22: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 7 +; GFX1164-NEXT: s_branch .LBB23_24 +; GFX1164-NEXT: .LBB23_23: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_24: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 8 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_26 +; GFX1164-NEXT: ; %bb.25: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 8 +; GFX1164-NEXT: s_branch .LBB23_27 +; GFX1164-NEXT: .LBB23_26: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_27: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 9 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 9 +; 
GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_29 +; GFX1164-NEXT: ; %bb.28: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 9 +; GFX1164-NEXT: s_branch .LBB23_30 +; GFX1164-NEXT: .LBB23_29: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_30: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 10 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_32 +; GFX1164-NEXT: ; %bb.31: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 10 +; GFX1164-NEXT: s_branch .LBB23_33 +; GFX1164-NEXT: .LBB23_32: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_33: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 11 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_35 +; GFX1164-NEXT: ; %bb.34: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 11 +; GFX1164-NEXT: s_branch .LBB23_36 +; GFX1164-NEXT: .LBB23_35: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_36: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 12 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_38 +; GFX1164-NEXT: ; %bb.37: +; 
GFX1164-NEXT: v_writelane_b32 v1, s6, 12 +; GFX1164-NEXT: s_branch .LBB23_39 +; GFX1164-NEXT: .LBB23_38: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_39: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 13 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_41 +; GFX1164-NEXT: ; %bb.40: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 13 +; GFX1164-NEXT: s_branch .LBB23_42 +; GFX1164-NEXT: .LBB23_41: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_42: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 14 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_44 +; GFX1164-NEXT: ; %bb.43: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 14 +; GFX1164-NEXT: s_branch .LBB23_45 +; GFX1164-NEXT: .LBB23_44: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_45: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 15 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_47 +; GFX1164-NEXT: ; %bb.46: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 15 +; GFX1164-NEXT: s_branch .LBB23_48 +; GFX1164-NEXT: .LBB23_47: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_48: +; GFX1164-NEXT: 
s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 16 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_50 +; GFX1164-NEXT: ; %bb.49: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 16 +; GFX1164-NEXT: s_branch .LBB23_51 +; GFX1164-NEXT: .LBB23_50: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_51: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 17 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_53 +; GFX1164-NEXT: ; %bb.52: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 17 +; GFX1164-NEXT: s_branch .LBB23_54 +; GFX1164-NEXT: .LBB23_53: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_54: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 18 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_56 +; GFX1164-NEXT: ; %bb.55: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 18 +; GFX1164-NEXT: s_branch .LBB23_57 +; GFX1164-NEXT: .LBB23_56: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_57: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX1164-NEXT: s_min_u32 s6, s6, 
s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 19 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_59 +; GFX1164-NEXT: ; %bb.58: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 19 +; GFX1164-NEXT: s_branch .LBB23_60 +; GFX1164-NEXT: .LBB23_59: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_60: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 20 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_62 +; GFX1164-NEXT: ; %bb.61: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 20 +; GFX1164-NEXT: s_branch .LBB23_63 +; GFX1164-NEXT: .LBB23_62: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_63: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 21 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_65 +; GFX1164-NEXT: ; %bb.64: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 21 +; GFX1164-NEXT: s_branch .LBB23_66 +; GFX1164-NEXT: .LBB23_65: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_66: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 22 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_68 +; GFX1164-NEXT: ; %bb.67: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 22 +; GFX1164-NEXT: s_branch .LBB23_69 +; GFX1164-NEXT: .LBB23_68: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_69: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 23 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_71 +; GFX1164-NEXT: ; %bb.70: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 23 +; GFX1164-NEXT: s_branch .LBB23_72 +; GFX1164-NEXT: .LBB23_71: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_72: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 24 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_74 +; GFX1164-NEXT: ; %bb.73: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 24 +; GFX1164-NEXT: s_branch .LBB23_75 +; GFX1164-NEXT: .LBB23_74: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_75: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 25 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_77 +; GFX1164-NEXT: ; %bb.76: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 25 +; GFX1164-NEXT: s_branch .LBB23_78 +; 
GFX1164-NEXT: .LBB23_77: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_78: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 26 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_80 +; GFX1164-NEXT: ; %bb.79: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 26 +; GFX1164-NEXT: s_branch .LBB23_81 +; GFX1164-NEXT: .LBB23_80: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_81: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 27 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_83 +; GFX1164-NEXT: ; %bb.82: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 27 +; GFX1164-NEXT: s_branch .LBB23_84 +; GFX1164-NEXT: .LBB23_83: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_84: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 28 +; GFX1164-NEXT: s_mov_b32 s3, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_86 +; GFX1164-NEXT: ; %bb.85: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 28 +; GFX1164-NEXT: s_branch .LBB23_87 +; GFX1164-NEXT: .LBB23_86: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_87: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 29 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_89 +; GFX1164-NEXT: ; %bb.88: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 29 +; GFX1164-NEXT: s_branch .LBB23_90 +; GFX1164-NEXT: .LBB23_89: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_90: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s8, exec_lo, 2.0 +; GFX1164-NEXT: s_min_u32 s4, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 30 +; GFX1164-NEXT: s_mov_b32 s9, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[8:9], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_92 +; GFX1164-NEXT: ; %bb.91: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 30 +; GFX1164-NEXT: s_branch .LBB23_93 +; GFX1164-NEXT: .LBB23_92: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_93: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1164-NEXT: v_readlane_b32 s5, v0, 31 +; GFX1164-NEXT: s_min_u32 s4, s4, s2 +; GFX1164-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1164-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX1164-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_95 +; GFX1164-NEXT: ; %bb.94: +; GFX1164-NEXT: v_writelane_b32 v1, s4, 31 +; GFX1164-NEXT: s_branch .LBB23_96 +; GFX1164-NEXT: .LBB23_95: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_96: +; GFX1164-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX1164-NEXT: s_cselect_b32 s2, s5, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX1164-NEXT: s_min_u32 s6, s4, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 32 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 0 
+; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_98 +; GFX1164-NEXT: ; %bb.97: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 32 +; GFX1164-NEXT: s_branch .LBB23_99 +; GFX1164-NEXT: .LBB23_98: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_99: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 33 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_101 +; GFX1164-NEXT: ; %bb.100: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 33 +; GFX1164-NEXT: s_branch .LBB23_102 +; GFX1164-NEXT: .LBB23_101: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_102: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 34 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_104 +; GFX1164-NEXT: ; %bb.103: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 34 +; GFX1164-NEXT: s_branch .LBB23_105 +; GFX1164-NEXT: .LBB23_104: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_105: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 35 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_107 +; GFX1164-NEXT: ; %bb.106: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 35 
+; GFX1164-NEXT: s_branch .LBB23_108 +; GFX1164-NEXT: .LBB23_107: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_108: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 36 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_110 +; GFX1164-NEXT: ; %bb.109: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 36 +; GFX1164-NEXT: s_branch .LBB23_111 +; GFX1164-NEXT: .LBB23_110: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_111: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 37 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_113 +; GFX1164-NEXT: ; %bb.112: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 37 +; GFX1164-NEXT: s_branch .LBB23_114 +; GFX1164-NEXT: .LBB23_113: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_114: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 38 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_116 +; GFX1164-NEXT: ; %bb.115: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 38 +; GFX1164-NEXT: s_branch .LBB23_117 +; GFX1164-NEXT: .LBB23_116: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_117: +; GFX1164-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 39 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_119 +; GFX1164-NEXT: ; %bb.118: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 39 +; GFX1164-NEXT: s_branch .LBB23_120 +; GFX1164-NEXT: .LBB23_119: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_120: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 40 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_122 +; GFX1164-NEXT: ; %bb.121: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 40 +; GFX1164-NEXT: s_branch .LBB23_123 +; GFX1164-NEXT: .LBB23_122: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_123: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 41 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_125 +; GFX1164-NEXT: ; %bb.124: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 41 +; GFX1164-NEXT: s_branch .LBB23_126 +; GFX1164-NEXT: .LBB23_125: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_126: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: 
v_readlane_b32 s7, v0, 42 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_128 +; GFX1164-NEXT: ; %bb.127: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 42 +; GFX1164-NEXT: s_branch .LBB23_129 +; GFX1164-NEXT: .LBB23_128: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_129: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 43 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_131 +; GFX1164-NEXT: ; %bb.130: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 43 +; GFX1164-NEXT: s_branch .LBB23_132 +; GFX1164-NEXT: .LBB23_131: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_132: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 44 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_134 +; GFX1164-NEXT: ; %bb.133: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 44 +; GFX1164-NEXT: s_branch .LBB23_135 +; GFX1164-NEXT: .LBB23_134: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_135: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 45 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_137 +; GFX1164-NEXT: ; %bb.136: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 45 +; GFX1164-NEXT: s_branch .LBB23_138 +; GFX1164-NEXT: .LBB23_137: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_138: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 46 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_140 +; GFX1164-NEXT: ; %bb.139: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 46 +; GFX1164-NEXT: s_branch .LBB23_141 +; GFX1164-NEXT: .LBB23_140: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_141: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 47 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_143 +; GFX1164-NEXT: ; %bb.142: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 47 +; GFX1164-NEXT: s_branch .LBB23_144 +; GFX1164-NEXT: .LBB23_143: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_144: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 48 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_146 +; GFX1164-NEXT: ; %bb.145: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1164-NEXT: 
s_branch .LBB23_147 +; GFX1164-NEXT: .LBB23_146: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_147: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 49 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_149 +; GFX1164-NEXT: ; %bb.148: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 49 +; GFX1164-NEXT: s_branch .LBB23_150 +; GFX1164-NEXT: .LBB23_149: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_150: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 50 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_152 +; GFX1164-NEXT: ; %bb.151: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 50 +; GFX1164-NEXT: s_branch .LBB23_153 +; GFX1164-NEXT: .LBB23_152: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_153: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 51 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_155 +; GFX1164-NEXT: ; %bb.154: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 51 +; GFX1164-NEXT: s_branch .LBB23_156 +; GFX1164-NEXT: .LBB23_155: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_156: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 52 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_158 +; GFX1164-NEXT: ; %bb.157: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 52 +; GFX1164-NEXT: s_branch .LBB23_159 +; GFX1164-NEXT: .LBB23_158: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_159: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 53 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_161 +; GFX1164-NEXT: ; %bb.160: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 53 +; GFX1164-NEXT: s_branch .LBB23_162 +; GFX1164-NEXT: .LBB23_161: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_162: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 54 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_164 +; GFX1164-NEXT: ; %bb.163: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 54 +; GFX1164-NEXT: s_branch .LBB23_165 +; GFX1164-NEXT: .LBB23_164: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_165: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; 
GFX1164-NEXT: v_readlane_b32 s7, v0, 55 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_167 +; GFX1164-NEXT: ; %bb.166: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 55 +; GFX1164-NEXT: s_branch .LBB23_168 +; GFX1164-NEXT: .LBB23_167: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_168: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 56 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_170 +; GFX1164-NEXT: ; %bb.169: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 56 +; GFX1164-NEXT: s_branch .LBB23_171 +; GFX1164-NEXT: .LBB23_170: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_171: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 57 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_173 +; GFX1164-NEXT: ; %bb.172: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 57 +; GFX1164-NEXT: s_branch .LBB23_174 +; GFX1164-NEXT: .LBB23_173: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_174: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 58 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_176 +; GFX1164-NEXT: ; %bb.175: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 58 +; GFX1164-NEXT: s_branch .LBB23_177 +; GFX1164-NEXT: .LBB23_176: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_177: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 59 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_179 +; GFX1164-NEXT: ; %bb.178: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 59 +; GFX1164-NEXT: s_branch .LBB23_180 +; GFX1164-NEXT: .LBB23_179: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_180: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 60 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_182 +; GFX1164-NEXT: ; %bb.181: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 60 +; GFX1164-NEXT: s_branch .LBB23_183 +; GFX1164-NEXT: .LBB23_182: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_183: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX1164-NEXT: s_min_u32 s6, s6, s4 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 61 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_185 +; GFX1164-NEXT: ; %bb.184: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 61 +; 
GFX1164-NEXT: s_branch .LBB23_186 +; GFX1164-NEXT: .LBB23_185: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_186: +; GFX1164-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s2, s7, -1 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_min_u32 s6, s6, s2 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 62 +; GFX1164-NEXT: s_mov_b32 s2, 0 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_188 +; GFX1164-NEXT: ; %bb.187: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 62 +; GFX1164-NEXT: s_branch .LBB23_189 +; GFX1164-NEXT: .LBB23_188: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_189: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s3, s7, -1 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_min_u32 s6, s6, s3 +; GFX1164-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX1164-NEXT: v_readlane_b32 s7, v0, 63 +; GFX1164-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX1164-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1164-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX1164-NEXT: s_cbranch_scc1 .LBB23_191 +; GFX1164-NEXT: ; %bb.190: +; GFX1164-NEXT: v_writelane_b32 v1, s6, 63 +; GFX1164-NEXT: s_branch .LBB23_192 +; GFX1164-NEXT: .LBB23_191: +; GFX1164-NEXT: ; implicit-def: $vgpr1 +; GFX1164-NEXT: .LBB23_192: +; GFX1164-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-NEXT: s_cbranch_execz .LBB23_2 -; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_cbranch_execz .LBB23_194 +; GFX1164-NEXT: ; %bb.193: +; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX1164-NEXT: s_cselect_b32 s4, s7, -1 ; GFX1164-NEXT: v_mov_b32_e32 v0, 
0 -; GFX1164-NEXT: v_mov_b32_e32 v4, s7 -; GFX1164-NEXT: s_mov_b32 s3, s7 +; GFX1164-NEXT: s_min_u32 s4, s6, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: v_mov_b32_e32 v2, s4 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1164-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv -; GFX1164-NEXT: .LBB23_2: -; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-NEXT: .LBB23_194: +; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1164-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1164-NEXT: v_mov_b32_e32 v0, v3 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6454,53 +51012,510 @@ ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry +; GFX1132-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX1132-NEXT: v_readlane_b32 s2, v0, 0 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX1132-NEXT: s_cselect_b32 s3, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s4, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_2 +; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v1, v0 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: v_mov_b32_e32 v1, -1 -; GFX1132-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_mov_b32_e32 v3, -1 -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:2 
row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: v_min_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, v1 -; GFX1132-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1132-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1132-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1132-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX1132-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1132-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1132-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1132-NEXT: s_mov_b32 s2, -1 +; GFX1132-NEXT: v_writelane_b32 v1, -1, 0 +; GFX1132-NEXT: s_branch .LBB23_3 +; GFX1132-NEXT: .LBB23_2: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_3: +; GFX1132-NEXT: s_and_b32 s3, s3, exec_lo +; GFX1132-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 1 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_5 +; GFX1132-NEXT: ; %bb.4: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 1 +; GFX1132-NEXT: s_branch .LBB23_6 +; GFX1132-NEXT: .LBB23_5: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_6: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 
4 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 2 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_8 +; GFX1132-NEXT: ; %bb.7: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 2 +; GFX1132-NEXT: s_branch .LBB23_9 +; GFX1132-NEXT: .LBB23_8: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_9: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 3 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_11 +; GFX1132-NEXT: ; %bb.10: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 3 +; GFX1132-NEXT: s_branch .LBB23_12 +; GFX1132-NEXT: .LBB23_11: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_12: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 4 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_14 +; GFX1132-NEXT: ; %bb.13: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 4 +; GFX1132-NEXT: s_branch .LBB23_15 +; GFX1132-NEXT: .LBB23_14: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_15: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 5 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_17 +; GFX1132-NEXT: ; %bb.16: +; GFX1132-NEXT: 
v_writelane_b32 v1, s2, 5 +; GFX1132-NEXT: s_branch .LBB23_18 +; GFX1132-NEXT: .LBB23_17: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_18: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 6 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_20 +; GFX1132-NEXT: ; %bb.19: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 6 +; GFX1132-NEXT: s_branch .LBB23_21 +; GFX1132-NEXT: .LBB23_20: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_21: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 7 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_23 +; GFX1132-NEXT: ; %bb.22: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 7 +; GFX1132-NEXT: s_branch .LBB23_24 +; GFX1132-NEXT: .LBB23_23: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_24: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 8 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_26 +; GFX1132-NEXT: ; %bb.25: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 8 +; GFX1132-NEXT: s_branch .LBB23_27 +; GFX1132-NEXT: .LBB23_26: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_27: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, 
exec_lo, 0x200 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 9 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_29 +; GFX1132-NEXT: ; %bb.28: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 9 +; GFX1132-NEXT: s_branch .LBB23_30 +; GFX1132-NEXT: .LBB23_29: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_30: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 10 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_32 +; GFX1132-NEXT: ; %bb.31: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 10 +; GFX1132-NEXT: s_branch .LBB23_33 +; GFX1132-NEXT: .LBB23_32: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_33: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 11 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_35 +; GFX1132-NEXT: ; %bb.34: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 11 +; GFX1132-NEXT: s_branch .LBB23_36 +; GFX1132-NEXT: .LBB23_35: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_36: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 12 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_38 +; 
GFX1132-NEXT: ; %bb.37: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 12 +; GFX1132-NEXT: s_branch .LBB23_39 +; GFX1132-NEXT: .LBB23_38: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_39: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 13 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_41 +; GFX1132-NEXT: ; %bb.40: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 13 +; GFX1132-NEXT: s_branch .LBB23_42 +; GFX1132-NEXT: .LBB23_41: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_42: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 14 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_44 +; GFX1132-NEXT: ; %bb.43: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 14 +; GFX1132-NEXT: s_branch .LBB23_45 +; GFX1132-NEXT: .LBB23_44: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_45: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 15 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_47 +; GFX1132-NEXT: ; %bb.46: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 15 +; GFX1132-NEXT: s_branch .LBB23_48 +; GFX1132-NEXT: .LBB23_47: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_48: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: 
s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 16 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_50 +; GFX1132-NEXT: ; %bb.49: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 16 +; GFX1132-NEXT: s_branch .LBB23_51 +; GFX1132-NEXT: .LBB23_50: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_51: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 17 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_53 +; GFX1132-NEXT: ; %bb.52: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 17 +; GFX1132-NEXT: s_branch .LBB23_54 +; GFX1132-NEXT: .LBB23_53: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_54: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 18 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_56 +; GFX1132-NEXT: ; %bb.55: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 18 +; GFX1132-NEXT: s_branch .LBB23_57 +; GFX1132-NEXT: .LBB23_56: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_57: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 19 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_59 +; GFX1132-NEXT: ; %bb.58: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 19 +; GFX1132-NEXT: s_branch .LBB23_60 +; GFX1132-NEXT: .LBB23_59: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_60: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 20 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_62 +; GFX1132-NEXT: ; %bb.61: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 20 +; GFX1132-NEXT: s_branch .LBB23_63 +; GFX1132-NEXT: .LBB23_62: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_63: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 21 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_65 +; GFX1132-NEXT: ; %bb.64: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 21 +; GFX1132-NEXT: s_branch .LBB23_66 +; GFX1132-NEXT: .LBB23_65: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_66: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 22 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_68 +; GFX1132-NEXT: ; %bb.67: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 22 +; GFX1132-NEXT: s_branch .LBB23_69 +; GFX1132-NEXT: .LBB23_68: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: 
.LBB23_69: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 23 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_71 +; GFX1132-NEXT: ; %bb.70: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 23 +; GFX1132-NEXT: s_branch .LBB23_72 +; GFX1132-NEXT: .LBB23_71: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_72: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 24 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_74 +; GFX1132-NEXT: ; %bb.73: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 24 +; GFX1132-NEXT: s_branch .LBB23_75 +; GFX1132-NEXT: .LBB23_74: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_75: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 25 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_77 +; GFX1132-NEXT: ; %bb.76: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 25 +; GFX1132-NEXT: s_branch .LBB23_78 +; GFX1132-NEXT: .LBB23_77: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_78: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 26 +; GFX1132-NEXT: s_bitcmp1_b32 
exec_lo, 26 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_80 +; GFX1132-NEXT: ; %bb.79: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 26 +; GFX1132-NEXT: s_branch .LBB23_81 +; GFX1132-NEXT: .LBB23_80: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_81: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 27 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_83 +; GFX1132-NEXT: ; %bb.82: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 27 +; GFX1132-NEXT: s_branch .LBB23_84 +; GFX1132-NEXT: .LBB23_83: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_84: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 28 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_86 +; GFX1132-NEXT: ; %bb.85: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 28 +; GFX1132-NEXT: s_branch .LBB23_87 +; GFX1132-NEXT: .LBB23_86: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_87: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 29 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_89 +; GFX1132-NEXT: ; %bb.88: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 29 +; GFX1132-NEXT: s_branch .LBB23_90 +; GFX1132-NEXT: 
.LBB23_89: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_90: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 30 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_92 +; GFX1132-NEXT: ; %bb.91: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 30 +; GFX1132-NEXT: s_branch .LBB23_93 +; GFX1132-NEXT: .LBB23_92: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_93: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX1132-NEXT: v_readlane_b32 s3, v0, 31 +; GFX1132-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX1132-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1132-NEXT: s_cmp_eq_u32 s5, 0 +; GFX1132-NEXT: s_cbranch_scc1 .LBB23_95 +; GFX1132-NEXT: ; %bb.94: +; GFX1132-NEXT: v_writelane_b32 v1, s2, 31 +; GFX1132-NEXT: s_branch .LBB23_96 +; GFX1132-NEXT: .LBB23_95: +; GFX1132-NEXT: ; implicit-def: $vgpr1 +; GFX1132-NEXT: .LBB23_96: +; GFX1132-NEXT: v_readfirstlane_b32 s5, v2 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1132-NEXT: s_cbranch_execz .LBB23_2 -; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX1132-NEXT: s_cbranch_execz .LBB23_98 +; GFX1132-NEXT: ; %bb.97: +; GFX1132-NEXT: s_and_b32 s4, s4, exec_lo +; GFX1132-NEXT: s_cselect_b32 s3, s3, -1 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0 -; GFX1132-NEXT: v_mov_b32_e32 v4, s4 +; GFX1132-NEXT: s_min_u32 s2, s2, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v4 +; GFX1132-NEXT: ds_min_rtn_u32 v0, v0, v2 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv -; GFX1132-NEXT: .LBB23_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-NEXT: .LBB23_98: +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1132-NEXT: v_mov_b32_e32 v0, v3 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_min_u32_e32 v0, s3, v0 +; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1 +; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -448,313 +448,5026 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 
row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB2_3 +; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB2_6 +; GFX8-NEXT: .LBB2_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB2_9 +; 
GFX8-NEXT: .LBB2_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB2_12 +; GFX8-NEXT: .LBB2_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB2_15 +; GFX8-NEXT: .LBB2_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB2_18 +; GFX8-NEXT: .LBB2_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; 
GFX8-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB2_21 +; GFX8-NEXT: .LBB2_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB2_24 +; GFX8-NEXT: .LBB2_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB2_27 +; GFX8-NEXT: .LBB2_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB2_30 +; GFX8-NEXT: .LBB2_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; 
GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB2_33 +; GFX8-NEXT: .LBB2_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB2_36 +; GFX8-NEXT: .LBB2_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB2_39 +; GFX8-NEXT: .LBB2_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB2_42 +; GFX8-NEXT: .LBB2_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_42: 
+; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB2_45 +; GFX8-NEXT: .LBB2_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB2_48 +; GFX8-NEXT: .LBB2_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB2_51 +; GFX8-NEXT: .LBB2_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX8-NEXT: ; 
%bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB2_54 +; GFX8-NEXT: .LBB2_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB2_57 +; GFX8-NEXT: .LBB2_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB2_60 +; GFX8-NEXT: .LBB2_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB2_63 +; GFX8-NEXT: .LBB2_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB2_66 +; GFX8-NEXT: .LBB2_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB2_69 +; GFX8-NEXT: .LBB2_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB2_72 +; GFX8-NEXT: .LBB2_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB2_75 +; GFX8-NEXT: .LBB2_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_75: +; GFX8-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB2_78 +; GFX8-NEXT: .LBB2_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB2_81 +; GFX8-NEXT: .LBB2_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB2_84 +; GFX8-NEXT: .LBB2_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX8-NEXT: ; %bb.85: +; 
GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB2_87 +; GFX8-NEXT: .LBB2_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB2_90 +; GFX8-NEXT: .LBB2_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB2_93 +; GFX8-NEXT: .LBB2_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB2_96 +; GFX8-NEXT: .LBB2_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB2_99 +; GFX8-NEXT: .LBB2_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB2_102 +; GFX8-NEXT: .LBB2_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB2_105 +; GFX8-NEXT: .LBB2_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB2_108 +; GFX8-NEXT: .LBB2_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB2_111 +; GFX8-NEXT: .LBB2_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB2_114 +; GFX8-NEXT: .LBB2_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB2_117 +; GFX8-NEXT: .LBB2_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, 
s6, 39 +; GFX8-NEXT: s_branch .LBB2_120 +; GFX8-NEXT: .LBB2_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB2_123 +; GFX8-NEXT: .LBB2_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB2_126 +; GFX8-NEXT: .LBB2_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB2_129 +; GFX8-NEXT: .LBB2_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB2_132 +; GFX8-NEXT: .LBB2_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB2_135 +; GFX8-NEXT: .LBB2_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB2_138 +; GFX8-NEXT: .LBB2_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB2_141 +; GFX8-NEXT: .LBB2_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB2_144 +; GFX8-NEXT: .LBB2_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB2_147 +; GFX8-NEXT: .LBB2_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB2_150 +; GFX8-NEXT: .LBB2_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB2_153 +; GFX8-NEXT: .LBB2_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB2_156 +; GFX8-NEXT: .LBB2_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB2_159 +; GFX8-NEXT: .LBB2_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB2_162 +; GFX8-NEXT: .LBB2_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB2_165 +; GFX8-NEXT: .LBB2_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB2_168 +; GFX8-NEXT: .LBB2_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB2_171 +; GFX8-NEXT: .LBB2_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB2_174 +; GFX8-NEXT: .LBB2_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_174: +; 
GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB2_177 +; GFX8-NEXT: .LBB2_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB2_180 +; GFX8-NEXT: .LBB2_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB2_183 +; GFX8-NEXT: .LBB2_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 
.LBB2_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB2_186 +; GFX8-NEXT: .LBB2_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB2_189 +; GFX8-NEXT: .LBB2_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB2_192 +; GFX8-NEXT: .LBB2_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB2_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB2_2: +; 
GFX8-NEXT: .LBB2_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, 
v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB2_3 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB2_6 +; GFX9-NEXT: .LBB2_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB2_9 +; GFX9-NEXT: .LBB2_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB2_12 +; GFX9-NEXT: .LBB2_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB2_15 +; GFX9-NEXT: .LBB2_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB2_18 +; GFX9-NEXT: .LBB2_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB2_21 +; GFX9-NEXT: .LBB2_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB2_24 +; GFX9-NEXT: .LBB2_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_24: +; GFX9-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB2_27 +; GFX9-NEXT: .LBB2_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB2_30 +; GFX9-NEXT: .LBB2_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB2_33 +; GFX9-NEXT: .LBB2_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, 
s6, 11 +; GFX9-NEXT: s_branch .LBB2_36 +; GFX9-NEXT: .LBB2_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB2_39 +; GFX9-NEXT: .LBB2_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB2_42 +; GFX9-NEXT: .LBB2_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB2_45 +; GFX9-NEXT: .LBB2_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB2_48 +; GFX9-NEXT: .LBB2_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB2_51 +; GFX9-NEXT: .LBB2_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB2_54 +; GFX9-NEXT: .LBB2_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB2_57 +; GFX9-NEXT: .LBB2_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, 
s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB2_60 +; GFX9-NEXT: .LBB2_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB2_63 +; GFX9-NEXT: .LBB2_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB2_66 +; GFX9-NEXT: .LBB2_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch 
.LBB2_69 +; GFX9-NEXT: .LBB2_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB2_72 +; GFX9-NEXT: .LBB2_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB2_75 +; GFX9-NEXT: .LBB2_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB2_78 +; GFX9-NEXT: .LBB2_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB2_81 +; GFX9-NEXT: .LBB2_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB2_84 +; GFX9-NEXT: .LBB2_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB2_87 +; GFX9-NEXT: .LBB2_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB2_90 +; GFX9-NEXT: .LBB2_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; 
GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB2_93 +; GFX9-NEXT: .LBB2_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB2_96 +; GFX9-NEXT: .LBB2_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB2_99 +; GFX9-NEXT: .LBB2_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB2_102 
+; GFX9-NEXT: .LBB2_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB2_105 +; GFX9-NEXT: .LBB2_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB2_108 +; GFX9-NEXT: .LBB2_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB2_111 +; GFX9-NEXT: .LBB2_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB2_114 +; GFX9-NEXT: .LBB2_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB2_117 +; GFX9-NEXT: .LBB2_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB2_120 +; GFX9-NEXT: .LBB2_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB2_123 +; GFX9-NEXT: .LBB2_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; 
GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB2_126 +; GFX9-NEXT: .LBB2_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB2_129 +; GFX9-NEXT: .LBB2_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB2_132 +; GFX9-NEXT: .LBB2_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB2_135 +; GFX9-NEXT: .LBB2_134: +; 
GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB2_138 +; GFX9-NEXT: .LBB2_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB2_141 +; GFX9-NEXT: .LBB2_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB2_144 +; GFX9-NEXT: .LBB2_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB2_147 +; GFX9-NEXT: .LBB2_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB2_150 +; GFX9-NEXT: .LBB2_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB2_153 +; GFX9-NEXT: .LBB2_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB2_156 +; GFX9-NEXT: .LBB2_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 
s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB2_159 +; GFX9-NEXT: .LBB2_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB2_162 +; GFX9-NEXT: .LBB2_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB2_165 +; GFX9-NEXT: .LBB2_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB2_168 
+; GFX9-NEXT: .LBB2_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB2_171 +; GFX9-NEXT: .LBB2_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB2_174 +; GFX9-NEXT: .LBB2_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB2_177 +; GFX9-NEXT: .LBB2_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB2_180 +; GFX9-NEXT: .LBB2_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB2_183 +; GFX9-NEXT: .LBB2_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB2_186 +; GFX9-NEXT: .LBB2_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB2_189 +; GFX9-NEXT: .LBB2_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_189: +; GFX9-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB2_192 +; GFX9-NEXT: .LBB2_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB2_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: .LBB2_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; 
GFX10W64-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB2_3 +; GFX10W64-NEXT: .LBB2_2: +; 
GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB2_6 +; GFX10W64-NEXT: .LBB2_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX10W64-NEXT: s_branch .LBB2_9 +; GFX10W64-NEXT: .LBB2_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB2_12 +; GFX10W64-NEXT: .LBB2_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, 
s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB2_15 +; GFX10W64-NEXT: .LBB2_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX10W64-NEXT: ; %bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB2_18 +; GFX10W64-NEXT: .LBB2_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB2_21 +; GFX10W64-NEXT: .LBB2_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB2_24 +; GFX10W64-NEXT: .LBB2_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB2_27 +; GFX10W64-NEXT: .LBB2_26: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX10W64-NEXT: ; %bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX10W64-NEXT: s_branch .LBB2_30 +; GFX10W64-NEXT: .LBB2_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: 
v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB2_33 +; GFX10W64-NEXT: .LBB2_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB2_36 +; GFX10W64-NEXT: .LBB2_35: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB2_39 +; GFX10W64-NEXT: .LBB2_38: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_39: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB2_42 +; GFX10W64-NEXT: .LBB2_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_42: +; 
GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB2_45 +; GFX10W64-NEXT: .LBB2_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX10W64-NEXT: ; %bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: s_branch .LBB2_48 +; GFX10W64-NEXT: .LBB2_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB2_51 +; GFX10W64-NEXT: .LBB2_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, 
exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB2_54 +; GFX10W64-NEXT: .LBB2_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB2_57 +; GFX10W64-NEXT: .LBB2_56: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_57: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB2_60 +; GFX10W64-NEXT: .LBB2_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB2_63 +; GFX10W64-NEXT: .LBB2_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB2_66 +; GFX10W64-NEXT: .LBB2_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB2_69 +; GFX10W64-NEXT: .LBB2_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_71 
+; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB2_72 +; GFX10W64-NEXT: .LBB2_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX10W64-NEXT: s_branch .LBB2_75 +; GFX10W64-NEXT: .LBB2_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_75: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX10W64-NEXT: s_branch .LBB2_78 +; GFX10W64-NEXT: .LBB2_77: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB2_81 +; 
GFX10W64-NEXT: .LBB2_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB2_84 +; GFX10W64-NEXT: .LBB2_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_84: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB2_87 +; GFX10W64-NEXT: .LBB2_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB2_90 +; GFX10W64-NEXT: .LBB2_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; 
GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB2_93 +; GFX10W64-NEXT: .LBB2_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB2_96 +; GFX10W64-NEXT: .LBB2_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB2_99 +; GFX10W64-NEXT: .LBB2_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB2_102 +; GFX10W64-NEXT: .LBB2_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX10W64-NEXT: s_branch .LBB2_105 +; GFX10W64-NEXT: .LBB2_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB2_108 +; GFX10W64-NEXT: .LBB2_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: 
s_cbranch_scc1 .LBB2_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB2_111 +; GFX10W64-NEXT: .LBB2_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX10W64-NEXT: s_branch .LBB2_114 +; GFX10W64-NEXT: .LBB2_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_114: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB2_117 +; GFX10W64-NEXT: .LBB2_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB2_120 +; GFX10W64-NEXT: 
.LBB2_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB2_123 +; GFX10W64-NEXT: .LBB2_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB2_126 +; GFX10W64-NEXT: .LBB2_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB2_129 +; GFX10W64-NEXT: .LBB2_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB2_132 +; GFX10W64-NEXT: .LBB2_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB2_135 +; GFX10W64-NEXT: .LBB2_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB2_138 +; GFX10W64-NEXT: .LBB2_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; 
GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB2_141 +; GFX10W64-NEXT: .LBB2_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB2_144 +; GFX10W64-NEXT: .LBB2_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB2_147 +; GFX10W64-NEXT: .LBB2_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB2_150 +; GFX10W64-NEXT: .LBB2_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB2_153 +; GFX10W64-NEXT: .LBB2_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX10W64-NEXT: ; %bb.154: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB2_156 +; GFX10W64-NEXT: .LBB2_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 
.LBB2_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB2_159 +; GFX10W64-NEXT: .LBB2_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: s_branch .LBB2_162 +; GFX10W64-NEXT: .LBB2_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB2_165 +; GFX10W64-NEXT: .LBB2_164: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB2_168 +; GFX10W64-NEXT: 
.LBB2_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB2_171 +; GFX10W64-NEXT: .LBB2_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB2_174 +; GFX10W64-NEXT: .LBB2_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB2_177 +; GFX10W64-NEXT: .LBB2_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_177: +; GFX10W64-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB2_180 +; GFX10W64-NEXT: .LBB2_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX10W64-NEXT: ; %bb.181: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch .LBB2_183 +; GFX10W64-NEXT: .LBB2_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB2_186 +; GFX10W64-NEXT: .LBB2_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; 
GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB2_189 +; GFX10W64-NEXT: .LBB2_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX10W64-NEXT: s_branch .LBB2_192 +; GFX10W64-NEXT: .LBB2_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB2_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: 
.LBB2_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; 
implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, off, s[4:7], 0 glc +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB2_3 ; GFX10W32-NEXT: .LBB2_2: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_3: +; GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX10W32-NEXT: s_branch .LBB2_6 +; GFX10W32-NEXT: .LBB2_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_6: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB2_9 +; GFX10W32-NEXT: .LBB2_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB2_12 +; GFX10W32-NEXT: .LBB2_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB2_15 +; GFX10W32-NEXT: .LBB2_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB2_18 +; GFX10W32-NEXT: .LBB2_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB2_21 +; GFX10W32-NEXT: .LBB2_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: s_branch .LBB2_24 +; GFX10W32-NEXT: .LBB2_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch .LBB2_27 +; GFX10W32-NEXT: .LBB2_26: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB2_30 +; GFX10W32-NEXT: .LBB2_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; 
GFX10W32-NEXT: .LBB2_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB2_33 +; GFX10W32-NEXT: .LBB2_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB2_36 +; GFX10W32-NEXT: .LBB2_35: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB2_39 +; GFX10W32-NEXT: .LBB2_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, 
v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB2_42 +; GFX10W32-NEXT: .LBB2_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX10W32-NEXT: s_branch .LBB2_45 +; GFX10W32-NEXT: .LBB2_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX10W32-NEXT: ; %bb.46: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB2_48 +; GFX10W32-NEXT: .LBB2_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 
v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB2_51 +; GFX10W32-NEXT: .LBB2_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB2_54 +; GFX10W32-NEXT: .LBB2_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_54: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX10W32-NEXT: ; %bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB2_57 +; GFX10W32-NEXT: .LBB2_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_57: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB2_60 +; GFX10W32-NEXT: .LBB2_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB2_63 +; GFX10W32-NEXT: .LBB2_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB2_66 +; GFX10W32-NEXT: .LBB2_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB2_69 +; GFX10W32-NEXT: .LBB2_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB2_72 +; GFX10W32-NEXT: .LBB2_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX10W32-NEXT: s_branch .LBB2_75 +; GFX10W32-NEXT: .LBB2_74: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX10W32-NEXT: s_branch .LBB2_78 +; GFX10W32-NEXT: .LBB2_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB2_81 +; GFX10W32-NEXT: .LBB2_80: +; 
GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB2_84 +; GFX10W32-NEXT: .LBB2_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch .LBB2_87 +; GFX10W32-NEXT: .LBB2_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB2_90 +; GFX10W32-NEXT: .LBB2_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 
s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB2_93 +; GFX10W32-NEXT: .LBB2_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB2_96 +; GFX10W32-NEXT: .LBB2_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB2_98 +; GFX10W32-NEXT: ; %bb.97: +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: .LBB2_98: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: 
v_add_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB2_3 +; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB2_6 +; GFX11W64-NEXT: .LBB2_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX11W64-NEXT: ; %bb.7: +; 
GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB2_9 +; GFX11W64-NEXT: .LBB2_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB2_12 +; GFX11W64-NEXT: .LBB2_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB2_15 +; GFX11W64-NEXT: .LBB2_14: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB2_18 +; GFX11W64-NEXT: .LBB2_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_18: +; 
GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB2_21 +; GFX11W64-NEXT: .LBB2_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB2_24 +; GFX11W64-NEXT: .LBB2_23: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB2_27 +; GFX11W64-NEXT: .LBB2_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB2_30 +; GFX11W64-NEXT: .LBB2_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch .LBB2_33 +; GFX11W64-NEXT: .LBB2_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB2_36 +; GFX11W64-NEXT: .LBB2_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; 
GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB2_39 +; GFX11W64-NEXT: .LBB2_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX11W64-NEXT: ; %bb.40: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB2_42 +; GFX11W64-NEXT: .LBB2_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB2_45 +; GFX11W64-NEXT: .LBB2_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX11W64-NEXT: ; %bb.46: +; 
GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB2_48 +; GFX11W64-NEXT: .LBB2_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB2_51 +; GFX11W64-NEXT: .LBB2_50: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB2_54 +; GFX11W64-NEXT: .LBB2_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB2_57 +; GFX11W64-NEXT: .LBB2_56: +; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB2_60 +; GFX11W64-NEXT: .LBB2_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX11W64-NEXT: s_branch .LBB2_63 +; GFX11W64-NEXT: .LBB2_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_63: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB2_66 +; GFX11W64-NEXT: .LBB2_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB2_69 +; GFX11W64-NEXT: .LBB2_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB2_72 +; GFX11W64-NEXT: .LBB2_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_72: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB2_75 +; GFX11W64-NEXT: .LBB2_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; 
GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB2_78 +; GFX11W64-NEXT: .LBB2_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB2_81 +; GFX11W64-NEXT: .LBB2_80: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB2_84 +; GFX11W64-NEXT: .LBB2_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB2_87 +; GFX11W64-NEXT: .LBB2_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch .LBB2_90 +; GFX11W64-NEXT: .LBB2_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB2_93 +; GFX11W64-NEXT: .LBB2_92: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; 
GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB2_96 +; GFX11W64-NEXT: .LBB2_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB2_99 +; GFX11W64-NEXT: .LBB2_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB2_102 +; GFX11W64-NEXT: .LBB2_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB2_105 +; GFX11W64-NEXT: .LBB2_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX11W64-NEXT: s_branch .LBB2_108 +; GFX11W64-NEXT: .LBB2_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB2_111 +; GFX11W64-NEXT: .LBB2_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; 
GFX11W64-NEXT: s_branch .LBB2_114 +; GFX11W64-NEXT: .LBB2_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB2_117 +; GFX11W64-NEXT: .LBB2_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB2_120 +; GFX11W64-NEXT: .LBB2_119: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB2_123 +; GFX11W64-NEXT: .LBB2_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB2_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB2_126 +; GFX11W64-NEXT: .LBB2_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch .LBB2_129 +; GFX11W64-NEXT: .LBB2_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB2_132 +; GFX11W64-NEXT: .LBB2_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB2_135 +; GFX11W64-NEXT: .LBB2_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX11W64-NEXT: s_branch .LBB2_138 +; GFX11W64-NEXT: .LBB2_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB2_141 +; GFX11W64-NEXT: .LBB2_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_141: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: 
s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB2_144 +; GFX11W64-NEXT: .LBB2_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB2_147 +; GFX11W64-NEXT: .LBB2_146: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB2_150 +; GFX11W64-NEXT: .LBB2_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB2_153 +; GFX11W64-NEXT: .LBB2_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch .LBB2_156 +; GFX11W64-NEXT: .LBB2_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB2_159 +; GFX11W64-NEXT: .LBB2_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: 
s_branch .LBB2_162 +; GFX11W64-NEXT: .LBB2_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB2_165 +; GFX11W64-NEXT: .LBB2_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB2_168 +; GFX11W64-NEXT: .LBB2_167: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB2_171 +; GFX11W64-NEXT: .LBB2_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB2_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB2_174 +; GFX11W64-NEXT: .LBB2_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX11W64-NEXT: s_branch .LBB2_177 +; GFX11W64-NEXT: .LBB2_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX11W64-NEXT: s_branch .LBB2_180 +; GFX11W64-NEXT: .LBB2_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, 
v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB2_183 +; GFX11W64-NEXT: .LBB2_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB2_186 +; GFX11W64-NEXT: .LBB2_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_186: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB2_189 +; GFX11W64-NEXT: .LBB2_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 
+; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB2_192 +; GFX11W64-NEXT: .LBB2_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_cbranch_execz .LBB2_194 +; GFX11W64-NEXT: ; %bb.193: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB2_194: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, 
s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; 
GFX11W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, off, s[4:7], 0 glc +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB2_3 ; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB2_6 +; GFX11W32-NEXT: .LBB2_5: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB2_9 +; GFX11W32-NEXT: .LBB2_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB2_12 +; GFX11W32-NEXT: .LBB2_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB2_15 +; GFX11W32-NEXT: .LBB2_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX11W32-NEXT: ; %bb.16: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB2_18 +; GFX11W32-NEXT: .LBB2_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; 
GFX11W32-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB2_21 +; GFX11W32-NEXT: .LBB2_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB2_24 +; GFX11W32-NEXT: .LBB2_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: s_branch .LBB2_27 +; GFX11W32-NEXT: .LBB2_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB2_30 +; GFX11W32-NEXT: .LBB2_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_30: +; 
GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB2_33 +; GFX11W32-NEXT: .LBB2_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB2_36 +; GFX11W32-NEXT: .LBB2_35: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB2_39 +; GFX11W32-NEXT: .LBB2_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: 
s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB2_42 +; GFX11W32-NEXT: .LBB2_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB2_45 +; GFX11W32-NEXT: .LBB2_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_45: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB2_48 +; GFX11W32-NEXT: .LBB2_47: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; 
GFX11W32-NEXT: s_branch .LBB2_51 +; GFX11W32-NEXT: .LBB2_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB2_54 +; GFX11W32-NEXT: .LBB2_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_54: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX11W32-NEXT: ; %bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB2_57 +; GFX11W32-NEXT: .LBB2_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_57: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB2_60 +; GFX11W32-NEXT: .LBB2_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB2_63 +; GFX11W32-NEXT: .LBB2_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB2_66 +; GFX11W32-NEXT: .LBB2_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX11W32-NEXT: ; %bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB2_69 +; GFX11W32-NEXT: .LBB2_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB2_72 +; GFX11W32-NEXT: .LBB2_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch .LBB2_75 +; GFX11W32-NEXT: .LBB2_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX11W32-NEXT: s_branch .LBB2_78 +; GFX11W32-NEXT: .LBB2_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB2_81 +; GFX11W32-NEXT: .LBB2_80: +; 
GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB2_84 +; GFX11W32-NEXT: .LBB2_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch .LBB2_87 +; GFX11W32-NEXT: .LBB2_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB2_90 +; GFX11W32-NEXT: .LBB2_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 
s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB2_93 +; GFX11W32-NEXT: .LBB2_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX11W32-NEXT: s_branch .LBB2_96 +; GFX11W32-NEXT: .LBB2_95: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB2_98 +; GFX11W32-NEXT: ; %bb.97: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB2_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; 
GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1280,313 +5993,5027 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB6_3 +; GFX8-NEXT: .LBB6_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB6_6 +; GFX8-NEXT: .LBB6_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB6_9 +; GFX8-NEXT: .LBB6_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB6_12 +; GFX8-NEXT: .LBB6_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: 
s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB6_15 +; GFX8-NEXT: .LBB6_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB6_18 +; GFX8-NEXT: .LBB6_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB6_21 +; GFX8-NEXT: .LBB6_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB6_24 +; GFX8-NEXT: .LBB6_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: 
.LBB6_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB6_27 +; GFX8-NEXT: .LBB6_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB6_30 +; GFX8-NEXT: .LBB6_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB6_33 +; GFX8-NEXT: .LBB6_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_35 +; GFX8-NEXT: ; 
%bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB6_36 +; GFX8-NEXT: .LBB6_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB6_39 +; GFX8-NEXT: .LBB6_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB6_42 +; GFX8-NEXT: .LBB6_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB6_45 +; GFX8-NEXT: .LBB6_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB6_48 +; GFX8-NEXT: .LBB6_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB6_51 +; GFX8-NEXT: .LBB6_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB6_54 +; GFX8-NEXT: .LBB6_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB6_57 +; GFX8-NEXT: .LBB6_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_57: +; GFX8-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB6_60 +; GFX8-NEXT: .LBB6_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB6_63 +; GFX8-NEXT: .LBB6_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB6_66 +; GFX8-NEXT: .LBB6_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB6_69 +; GFX8-NEXT: .LBB6_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB6_72 +; GFX8-NEXT: .LBB6_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB6_75 +; GFX8-NEXT: .LBB6_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB6_78 +; GFX8-NEXT: .LBB6_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB6_81 +; GFX8-NEXT: .LBB6_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB6_84 +; GFX8-NEXT: .LBB6_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB6_87 +; GFX8-NEXT: .LBB6_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB6_90 +; GFX8-NEXT: .LBB6_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB6_93 +; GFX8-NEXT: .LBB6_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB6_96 +; GFX8-NEXT: .LBB6_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB6_99 +; GFX8-NEXT: .LBB6_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB6_102 +; GFX8-NEXT: .LBB6_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB6_105 +; GFX8-NEXT: .LBB6_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB6_108 +; GFX8-NEXT: .LBB6_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB6_111 +; GFX8-NEXT: .LBB6_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB6_114 +; GFX8-NEXT: .LBB6_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB6_117 +; GFX8-NEXT: .LBB6_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB6_120 +; GFX8-NEXT: .LBB6_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB6_123 +; GFX8-NEXT: .LBB6_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB6_126 +; GFX8-NEXT: .LBB6_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB6_129 +; GFX8-NEXT: .LBB6_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB6_132 +; GFX8-NEXT: .LBB6_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB6_135 +; GFX8-NEXT: .LBB6_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB6_138 +; GFX8-NEXT: .LBB6_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB6_141 +; GFX8-NEXT: .LBB6_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB6_144 +; GFX8-NEXT: .LBB6_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB6_147 +; GFX8-NEXT: .LBB6_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB6_150 +; GFX8-NEXT: .LBB6_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB6_153 +; GFX8-NEXT: .LBB6_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB6_156 +; GFX8-NEXT: .LBB6_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_156: +; GFX8-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB6_159 +; GFX8-NEXT: .LBB6_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB6_162 +; GFX8-NEXT: .LBB6_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB6_165 +; GFX8-NEXT: .LBB6_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_167 +; 
GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB6_168 +; GFX8-NEXT: .LBB6_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB6_171 +; GFX8-NEXT: .LBB6_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB6_174 +; GFX8-NEXT: .LBB6_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB6_177 +; GFX8-NEXT: .LBB6_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; 
GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB6_180 +; GFX8-NEXT: .LBB6_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB6_183 +; GFX8-NEXT: .LBB6_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB6_186 +; GFX8-NEXT: .LBB6_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB6_189 +; GFX8-NEXT: 
.LBB6_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB6_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB6_192 +; GFX8-NEXT: .LBB6_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB6_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB6_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB6_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX8-NEXT: .LBB6_2: +; GFX8-NEXT: .LBB6_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 
v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB6_3 +; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB6_6 +; GFX9-NEXT: .LBB6_5: +; GFX9-NEXT: 
; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB6_9 +; GFX9-NEXT: .LBB6_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB6_12 +; GFX9-NEXT: .LBB6_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB6_15 +; GFX9-NEXT: .LBB6_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_17 +; 
GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB6_18 +; GFX9-NEXT: .LBB6_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB6_21 +; GFX9-NEXT: .LBB6_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB6_24 +; GFX9-NEXT: .LBB6_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB6_27 +; GFX9-NEXT: .LBB6_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB6_30 +; GFX9-NEXT: .LBB6_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB6_33 +; GFX9-NEXT: .LBB6_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB6_36 +; GFX9-NEXT: .LBB6_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB6_39 +; GFX9-NEXT: .LBB6_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB6_42 +; GFX9-NEXT: .LBB6_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB6_45 +; GFX9-NEXT: .LBB6_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB6_48 +; GFX9-NEXT: .LBB6_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 
+; GFX9-NEXT: s_branch .LBB6_51 +; GFX9-NEXT: .LBB6_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB6_54 +; GFX9-NEXT: .LBB6_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB6_57 +; GFX9-NEXT: .LBB6_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB6_60 +; GFX9-NEXT: .LBB6_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB6_63 +; GFX9-NEXT: .LBB6_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB6_66 +; GFX9-NEXT: .LBB6_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB6_69 +; GFX9-NEXT: .LBB6_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB6_72 +; GFX9-NEXT: .LBB6_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, 
s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB6_75 +; GFX9-NEXT: .LBB6_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB6_78 +; GFX9-NEXT: .LBB6_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB6_81 +; GFX9-NEXT: .LBB6_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: 
s_branch .LBB6_84 +; GFX9-NEXT: .LBB6_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB6_87 +; GFX9-NEXT: .LBB6_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB6_90 +; GFX9-NEXT: .LBB6_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB6_93 +; GFX9-NEXT: .LBB6_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 
+; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB6_96 +; GFX9-NEXT: .LBB6_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB6_99 +; GFX9-NEXT: .LBB6_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB6_102 +; GFX9-NEXT: .LBB6_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB6_105 +; GFX9-NEXT: .LBB6_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB6_108 +; GFX9-NEXT: .LBB6_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB6_111 +; GFX9-NEXT: .LBB6_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB6_114 +; GFX9-NEXT: .LBB6_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB6_117 +; 
GFX9-NEXT: .LBB6_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB6_120 +; GFX9-NEXT: .LBB6_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB6_123 +; GFX9-NEXT: .LBB6_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB6_126 +; GFX9-NEXT: .LBB6_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB6_129 +; GFX9-NEXT: .LBB6_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB6_132 +; GFX9-NEXT: .LBB6_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB6_135 +; GFX9-NEXT: .LBB6_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB6_138 +; GFX9-NEXT: .LBB6_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: 
s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB6_141 +; GFX9-NEXT: .LBB6_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB6_144 +; GFX9-NEXT: .LBB6_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB6_147 +; GFX9-NEXT: .LBB6_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch 
.LBB6_150 +; GFX9-NEXT: .LBB6_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB6_153 +; GFX9-NEXT: .LBB6_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB6_156 +; GFX9-NEXT: .LBB6_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB6_159 +; GFX9-NEXT: .LBB6_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB6_162 +; GFX9-NEXT: .LBB6_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB6_165 +; GFX9-NEXT: .LBB6_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB6_168 +; GFX9-NEXT: .LBB6_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB6_171 +; GFX9-NEXT: .LBB6_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB6_174 +; GFX9-NEXT: .LBB6_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB6_177 +; GFX9-NEXT: .LBB6_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB6_180 +; GFX9-NEXT: .LBB6_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: 
v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB6_183 +; GFX9-NEXT: .LBB6_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB6_186 +; GFX9-NEXT: .LBB6_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB6_189 +; GFX9-NEXT: .LBB6_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB6_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB6_192 +; GFX9-NEXT: .LBB6_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB6_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: 
$vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB6_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB6_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc -; GFX9-NEXT: .LBB6_2: +; GFX9-NEXT: .LBB6_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB6_3 +; GFX10W64-NEXT: .LBB6_2: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB6_6 +; GFX10W64-NEXT: .LBB6_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, 
s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_8 +; GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX10W64-NEXT: s_branch .LBB6_9 +; GFX10W64-NEXT: .LBB6_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB6_12 +; GFX10W64-NEXT: .LBB6_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB6_15 +; GFX10W64-NEXT: .LBB6_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_17 +; GFX10W64-NEXT: ; %bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB6_18 +; GFX10W64-NEXT: .LBB6_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB6_21 +; GFX10W64-NEXT: .LBB6_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB6_24 +; GFX10W64-NEXT: .LBB6_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 
.LBB6_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB6_27 +; GFX10W64-NEXT: .LBB6_26: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_29 +; GFX10W64-NEXT: ; %bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX10W64-NEXT: s_branch .LBB6_30 +; GFX10W64-NEXT: .LBB6_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB6_33 +; GFX10W64-NEXT: .LBB6_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB6_36 +; GFX10W64-NEXT: .LBB6_35: +; GFX10W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB6_39 +; GFX10W64-NEXT: .LBB6_38: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_39: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB6_42 +; GFX10W64-NEXT: .LBB6_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_42: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB6_45 +; GFX10W64-NEXT: .LBB6_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_47 +; GFX10W64-NEXT: ; %bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: s_branch .LBB6_48 +; GFX10W64-NEXT: .LBB6_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB6_51 +; GFX10W64-NEXT: .LBB6_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB6_54 +; GFX10W64-NEXT: .LBB6_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; 
GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB6_57 +; GFX10W64-NEXT: .LBB6_56: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_57: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB6_60 +; GFX10W64-NEXT: .LBB6_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB6_63 +; GFX10W64-NEXT: .LBB6_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB6_66 +; GFX10W64-NEXT: .LBB6_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_68 +; GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB6_69 +; GFX10W64-NEXT: .LBB6_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_71 +; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB6_72 +; GFX10W64-NEXT: .LBB6_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 
24 +; GFX10W64-NEXT: s_branch .LBB6_75 +; GFX10W64-NEXT: .LBB6_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_75: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX10W64-NEXT: s_branch .LBB6_78 +; GFX10W64-NEXT: .LBB6_77: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB6_81 +; GFX10W64-NEXT: .LBB6_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB6_84 +; GFX10W64-NEXT: .LBB6_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_84: +; GFX10W64-NEXT: 
s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB6_87 +; GFX10W64-NEXT: .LBB6_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB6_90 +; GFX10W64-NEXT: .LBB6_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB6_93 +; GFX10W64-NEXT: .LBB6_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; 
GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB6_96 +; GFX10W64-NEXT: .LBB6_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB6_99 +; GFX10W64-NEXT: .LBB6_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB6_102 +; GFX10W64-NEXT: .LBB6_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX10W64-NEXT: s_branch .LBB6_105 +; GFX10W64-NEXT: .LBB6_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_107 +; GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB6_108 +; GFX10W64-NEXT: .LBB6_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB6_111 +; GFX10W64-NEXT: .LBB6_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; 
GFX10W64-NEXT: s_branch .LBB6_114 +; GFX10W64-NEXT: .LBB6_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_114: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB6_117 +; GFX10W64-NEXT: .LBB6_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB6_120 +; GFX10W64-NEXT: .LBB6_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB6_123 +; GFX10W64-NEXT: .LBB6_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: 
.LBB6_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_125 +; GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB6_126 +; GFX10W64-NEXT: .LBB6_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB6_129 +; GFX10W64-NEXT: .LBB6_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB6_132 +; GFX10W64-NEXT: .LBB6_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; 
GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB6_135 +; GFX10W64-NEXT: .LBB6_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB6_138 +; GFX10W64-NEXT: .LBB6_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB6_141 +; GFX10W64-NEXT: .LBB6_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_143 +; GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB6_144 +; GFX10W64-NEXT: .LBB6_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB6_147 +; GFX10W64-NEXT: .LBB6_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB6_150 +; GFX10W64-NEXT: .LBB6_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB6_153 +; GFX10W64-NEXT: .LBB6_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_155 +; GFX10W64-NEXT: ; %bb.154: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB6_156 +; GFX10W64-NEXT: .LBB6_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB6_159 +; GFX10W64-NEXT: .LBB6_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: 
s_branch .LBB6_162 +; GFX10W64-NEXT: .LBB6_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB6_165 +; GFX10W64-NEXT: .LBB6_164: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB6_168 +; GFX10W64-NEXT: .LBB6_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB6_171 +; GFX10W64-NEXT: .LBB6_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: 
.LBB6_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_173 +; GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB6_174 +; GFX10W64-NEXT: .LBB6_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB6_177 +; GFX10W64-NEXT: .LBB6_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_177: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB6_180 +; GFX10W64-NEXT: .LBB6_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, 
v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_182 +; GFX10W64-NEXT: ; %bb.181: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch .LBB6_183 +; GFX10W64-NEXT: .LBB6_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB6_186 +; GFX10W64-NEXT: .LBB6_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB6_189 +; GFX10W64-NEXT: .LBB6_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 
+; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX10W64-NEXT: s_branch .LBB6_192 +; GFX10W64-NEXT: .LBB6_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB6_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB6_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB6_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, off, s[8:11], 0 glc -; GFX10W64-NEXT: .LBB6_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W64-NEXT: .LBB6_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, off, s[4:7], 0 glc +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB6_3 ; 
GFX10W32-NEXT: .LBB6_2: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_3: +; GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX10W32-NEXT: s_branch .LBB6_6 +; GFX10W32-NEXT: .LBB6_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_6: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB6_9 +; GFX10W32-NEXT: .LBB6_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB6_12 +; GFX10W32-NEXT: .LBB6_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; 
GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB6_15 +; GFX10W32-NEXT: .LBB6_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB6_18 +; GFX10W32-NEXT: .LBB6_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB6_21 +; GFX10W32-NEXT: .LBB6_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: 
s_branch .LBB6_24 +; GFX10W32-NEXT: .LBB6_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_26 +; GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch .LBB6_27 +; GFX10W32-NEXT: .LBB6_26: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB6_30 +; GFX10W32-NEXT: .LBB6_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB6_33 +; GFX10W32-NEXT: .LBB6_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 
0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB6_36 +; GFX10W32-NEXT: .LBB6_35: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB6_39 +; GFX10W32-NEXT: .LBB6_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB6_42 +; GFX10W32-NEXT: .LBB6_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: 
s_cbranch_scc1 .LBB6_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX10W32-NEXT: s_branch .LBB6_45 +; GFX10W32-NEXT: .LBB6_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_47 +; GFX10W32-NEXT: ; %bb.46: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB6_48 +; GFX10W32-NEXT: .LBB6_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB6_51 +; GFX10W32-NEXT: .LBB6_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB6_54 +; GFX10W32-NEXT: .LBB6_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_54: +; 
GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_56 +; GFX10W32-NEXT: ; %bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB6_57 +; GFX10W32-NEXT: .LBB6_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_57: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB6_60 +; GFX10W32-NEXT: .LBB6_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB6_63 +; GFX10W32-NEXT: .LBB6_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; 
GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB6_66 +; GFX10W32-NEXT: .LBB6_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB6_69 +; GFX10W32-NEXT: .LBB6_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB6_72 +; GFX10W32-NEXT: .LBB6_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, 
s2, 24 +; GFX10W32-NEXT: s_branch .LBB6_75 +; GFX10W32-NEXT: .LBB6_74: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX10W32-NEXT: s_branch .LBB6_78 +; GFX10W32-NEXT: .LBB6_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB6_81 +; GFX10W32-NEXT: .LBB6_80: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB6_84 +; GFX10W32-NEXT: .LBB6_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 
+; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch .LBB6_87 +; GFX10W32-NEXT: .LBB6_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB6_90 +; GFX10W32-NEXT: .LBB6_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB6_93 +; GFX10W32-NEXT: .LBB6_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 
+; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB6_96 +; GFX10W32-NEXT: .LBB6_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB6_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB6_98 +; GFX10W32-NEXT: ; %bb.97: +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX10W32-NEXT: .LBB6_98: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: v_mov_b32_e32 
v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: 
v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB6_3 +; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB6_6 +; GFX11W64-NEXT: .LBB6_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_8 +; GFX11W64-NEXT: ; %bb.7: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB6_9 +; GFX11W64-NEXT: .LBB6_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB6_12 +; GFX11W64-NEXT: .LBB6_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB6_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB6_15 +; GFX11W64-NEXT: .LBB6_14: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB6_18 +; GFX11W64-NEXT: .LBB6_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_18: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB6_21 +; GFX11W64-NEXT: .LBB6_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB6_24 +; GFX11W64-NEXT: .LBB6_23: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB6_27 +; GFX11W64-NEXT: .LBB6_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB6_30 +; GFX11W64-NEXT: .LBB6_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; 
GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch .LBB6_33 +; GFX11W64-NEXT: .LBB6_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB6_36 +; GFX11W64-NEXT: .LBB6_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB6_39 +; GFX11W64-NEXT: .LBB6_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_41 +; GFX11W64-NEXT: ; %bb.40: +; 
GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB6_42 +; GFX11W64-NEXT: .LBB6_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB6_45 +; GFX11W64-NEXT: .LBB6_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_47 +; GFX11W64-NEXT: ; %bb.46: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB6_48 +; GFX11W64-NEXT: .LBB6_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB6_51 +; GFX11W64-NEXT: .LBB6_50: +; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB6_54 +; GFX11W64-NEXT: .LBB6_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB6_57 +; GFX11W64-NEXT: .LBB6_56: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB6_60 +; GFX11W64-NEXT: .LBB6_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX11W64-NEXT: s_branch .LBB6_63 +; GFX11W64-NEXT: .LBB6_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_63: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB6_66 +; GFX11W64-NEXT: .LBB6_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB6_69 +; GFX11W64-NEXT: .LBB6_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; 
GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB6_72 +; GFX11W64-NEXT: .LBB6_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_72: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB6_75 +; GFX11W64-NEXT: .LBB6_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB6_78 +; GFX11W64-NEXT: .LBB6_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB6_81 +; GFX11W64-NEXT: .LBB6_80: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB6_84 +; GFX11W64-NEXT: .LBB6_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB6_87 +; GFX11W64-NEXT: .LBB6_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch 
.LBB6_90 +; GFX11W64-NEXT: .LBB6_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_92 +; GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB6_93 +; GFX11W64-NEXT: .LBB6_92: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB6_96 +; GFX11W64-NEXT: .LBB6_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB6_99 +; GFX11W64-NEXT: .LBB6_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB6_102 +; GFX11W64-NEXT: .LBB6_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB6_105 +; GFX11W64-NEXT: .LBB6_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; 
GFX11W64-NEXT: s_branch .LBB6_108 +; GFX11W64-NEXT: .LBB6_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB6_111 +; GFX11W64-NEXT: .LBB6_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX11W64-NEXT: s_branch .LBB6_114 +; GFX11W64-NEXT: .LBB6_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB6_117 +; GFX11W64-NEXT: .LBB6_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB6_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB6_120 +; GFX11W64-NEXT: .LBB6_119: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB6_123 +; GFX11W64-NEXT: .LBB6_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB6_126 +; GFX11W64-NEXT: .LBB6_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch .LBB6_129 +; GFX11W64-NEXT: .LBB6_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB6_132 +; GFX11W64-NEXT: .LBB6_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB6_135 +; GFX11W64-NEXT: .LBB6_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 
exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX11W64-NEXT: s_branch .LBB6_138 +; GFX11W64-NEXT: .LBB6_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB6_141 +; GFX11W64-NEXT: .LBB6_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_141: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB6_144 +; GFX11W64-NEXT: .LBB6_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX11W64-NEXT: s_cbranch_scc1 .LBB6_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB6_147 +; GFX11W64-NEXT: .LBB6_146: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB6_150 +; GFX11W64-NEXT: .LBB6_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB6_153 +; GFX11W64-NEXT: .LBB6_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch 
.LBB6_156 +; GFX11W64-NEXT: .LBB6_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_158 +; GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB6_159 +; GFX11W64-NEXT: .LBB6_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: s_branch .LBB6_162 +; GFX11W64-NEXT: .LBB6_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB6_165 +; GFX11W64-NEXT: .LBB6_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB6_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB6_168 +; GFX11W64-NEXT: .LBB6_167: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB6_171 +; GFX11W64-NEXT: .LBB6_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB6_174 +; GFX11W64-NEXT: .LBB6_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, 
v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX11W64-NEXT: s_branch .LBB6_177 +; GFX11W64-NEXT: .LBB6_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX11W64-NEXT: s_branch .LBB6_180 +; GFX11W64-NEXT: .LBB6_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB6_183 +; GFX11W64-NEXT: .LBB6_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; 
GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB6_186 +; GFX11W64-NEXT: .LBB6_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_186: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB6_189 +; GFX11W64-NEXT: .LBB6_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB6_192 +; GFX11W64-NEXT: .LBB6_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB6_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz 
.LBB6_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_cbranch_execz .LBB6_194 +; GFX11W64-NEXT: ; %bb.193: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, off, s[8:11], 0 glc -; GFX11W64-NEXT: .LBB6_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W64-NEXT: .LBB6_194: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; 
GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB6_2 +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_2 ; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, off, s[4:7], 0 glc +; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB6_3 ; GFX11W32-NEXT: .LBB6_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; 
GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB6_6 +; GFX11W32-NEXT: .LBB6_5: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB6_9 +; GFX11W32-NEXT: .LBB6_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB6_12 +; GFX11W32-NEXT: .LBB6_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 
exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB6_15 +; GFX11W32-NEXT: .LBB6_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_17 +; GFX11W32-NEXT: ; %bb.16: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB6_18 +; GFX11W32-NEXT: .LBB6_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB6_21 +; GFX11W32-NEXT: .LBB6_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB6_24 +; 
GFX11W32-NEXT: .LBB6_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: s_branch .LBB6_27 +; GFX11W32-NEXT: .LBB6_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB6_30 +; GFX11W32-NEXT: .LBB6_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_30: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB6_33 +; GFX11W32-NEXT: .LBB6_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: 
s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_35 +; GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB6_36 +; GFX11W32-NEXT: .LBB6_35: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB6_39 +; GFX11W32-NEXT: .LBB6_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB6_42 +; GFX11W32-NEXT: .LBB6_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_44 +; 
GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB6_45 +; GFX11W32-NEXT: .LBB6_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_45: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB6_48 +; GFX11W32-NEXT: .LBB6_47: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX11W32-NEXT: s_branch .LBB6_51 +; GFX11W32-NEXT: .LBB6_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB6_54 +; GFX11W32-NEXT: .LBB6_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_54: +; GFX11W32-NEXT: s_and_b32 
s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_56 +; GFX11W32-NEXT: ; %bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB6_57 +; GFX11W32-NEXT: .LBB6_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_57: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB6_60 +; GFX11W32-NEXT: .LBB6_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB6_63 +; GFX11W32-NEXT: .LBB6_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 
21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB6_66 +; GFX11W32-NEXT: .LBB6_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_68 +; GFX11W32-NEXT: ; %bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB6_69 +; GFX11W32-NEXT: .LBB6_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB6_72 +; GFX11W32-NEXT: .LBB6_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch 
.LBB6_75 +; GFX11W32-NEXT: .LBB6_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX11W32-NEXT: s_branch .LBB6_78 +; GFX11W32-NEXT: .LBB6_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB6_81 +; GFX11W32-NEXT: .LBB6_80: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB6_84 +; GFX11W32-NEXT: .LBB6_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, 
exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch .LBB6_87 +; GFX11W32-NEXT: .LBB6_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB6_90 +; GFX11W32-NEXT: .LBB6_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB6_93 +; GFX11W32-NEXT: .LBB6_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 
s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX11W32-NEXT: s_branch .LBB6_96 +; GFX11W32-NEXT: .LBB6_95: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB6_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB6_98 +; GFX11W32-NEXT: ; %bb.97: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc +; GFX11W32-NEXT: .LBB6_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ 
llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -463,312 +463,5032 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB2_3 +; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], 
-1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB2_6 +; GFX8-NEXT: .LBB2_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB2_9 +; GFX8-NEXT: .LBB2_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB2_12 +; GFX8-NEXT: .LBB2_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB2_15 +; GFX8-NEXT: .LBB2_14: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 
s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB2_18 +; GFX8-NEXT: .LBB2_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB2_21 +; GFX8-NEXT: .LBB2_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB2_24 +; GFX8-NEXT: .LBB2_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB2_27 +; GFX8-NEXT: .LBB2_26: +; GFX8-NEXT: ; 
implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB2_30 +; GFX8-NEXT: .LBB2_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB2_33 +; GFX8-NEXT: .LBB2_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB2_36 +; GFX8-NEXT: .LBB2_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB2_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB2_39 +; GFX8-NEXT: .LBB2_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB2_42 +; GFX8-NEXT: .LBB2_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB2_45 +; GFX8-NEXT: .LBB2_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB2_48 +; GFX8-NEXT: .LBB2_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_48: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; 
GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB2_51 +; GFX8-NEXT: .LBB2_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB2_54 +; GFX8-NEXT: .LBB2_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB2_57 +; GFX8-NEXT: .LBB2_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX8-NEXT: ; %bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB2_60 +; GFX8-NEXT: .LBB2_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: 
.LBB2_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB2_63 +; GFX8-NEXT: .LBB2_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB2_66 +; GFX8-NEXT: .LBB2_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB2_69 +; GFX8-NEXT: .LBB2_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_71 
+; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB2_72 +; GFX8-NEXT: .LBB2_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB2_75 +; GFX8-NEXT: .LBB2_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB2_78 +; GFX8-NEXT: .LBB2_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB2_81 +; GFX8-NEXT: .LBB2_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_81: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: 
s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB2_84 +; GFX8-NEXT: .LBB2_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB2_87 +; GFX8-NEXT: .LBB2_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB2_90 +; GFX8-NEXT: .LBB2_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB2_93 +; GFX8-NEXT: .LBB2_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 
+; GFX8-NEXT: .LBB2_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB2_96 +; GFX8-NEXT: .LBB2_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB2_99 +; GFX8-NEXT: .LBB2_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB2_102 +; GFX8-NEXT: .LBB2_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX8-NEXT: ; 
%bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB2_105 +; GFX8-NEXT: .LBB2_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB2_108 +; GFX8-NEXT: .LBB2_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB2_111 +; GFX8-NEXT: .LBB2_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB2_114 +; GFX8-NEXT: .LBB2_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB2_117 +; GFX8-NEXT: .LBB2_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB2_120 +; GFX8-NEXT: .LBB2_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB2_123 +; GFX8-NEXT: .LBB2_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, s6, 41 +; GFX8-NEXT: s_branch .LBB2_126 +; GFX8-NEXT: .LBB2_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_126: +; GFX8-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB2_129 +; GFX8-NEXT: .LBB2_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB2_132 +; GFX8-NEXT: .LBB2_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB2_135 +; GFX8-NEXT: .LBB2_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX8-NEXT: ; %bb.136: +; 
GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB2_138 +; GFX8-NEXT: .LBB2_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB2_141 +; GFX8-NEXT: .LBB2_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB2_144 +; GFX8-NEXT: .LBB2_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB2_147 +; GFX8-NEXT: .LBB2_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB2_150 +; GFX8-NEXT: .LBB2_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB2_153 +; GFX8-NEXT: .LBB2_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB2_156 +; GFX8-NEXT: .LBB2_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB2_159 +; GFX8-NEXT: .LBB2_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_159: +; GFX8-NEXT: 
s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB2_162 +; GFX8-NEXT: .LBB2_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB2_165 +; GFX8-NEXT: .LBB2_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB2_168 +; GFX8-NEXT: .LBB2_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_170 +; 
GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB2_171 +; GFX8-NEXT: .LBB2_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB2_174 +; GFX8-NEXT: .LBB2_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB2_177 +; GFX8-NEXT: .LBB2_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB2_180 +; GFX8-NEXT: .LBB2_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_180: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; 
GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB2_183 +; GFX8-NEXT: .LBB2_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB2_186 +; GFX8-NEXT: .LBB2_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB2_189 +; GFX8-NEXT: .LBB2_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 
63 +; GFX8-NEXT: s_branch .LBB2_192 +; GFX8-NEXT: .LBB2_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB2_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB2_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB2_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB2_2: +; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB2_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 0 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 
-; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB2_3 +; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB2_6 +; GFX9-NEXT: .LBB2_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 
v1, s6, 2 +; GFX9-NEXT: s_branch .LBB2_9 +; GFX9-NEXT: .LBB2_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB2_12 +; GFX9-NEXT: .LBB2_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB2_15 +; GFX9-NEXT: .LBB2_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB2_18 +; GFX9-NEXT: .LBB2_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB2_21 +; GFX9-NEXT: .LBB2_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB2_24 +; GFX9-NEXT: .LBB2_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB2_27 +; GFX9-NEXT: .LBB2_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB2_30 +; GFX9-NEXT: .LBB2_29: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; 
GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB2_33 +; GFX9-NEXT: .LBB2_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB2_36 +; GFX9-NEXT: .LBB2_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB2_39 +; GFX9-NEXT: .LBB2_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB2_42 +; GFX9-NEXT: .LBB2_41: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB2_45 +; GFX9-NEXT: .LBB2_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB2_48 +; GFX9-NEXT: .LBB2_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB2_51 +; GFX9-NEXT: .LBB2_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB2_54 +; GFX9-NEXT: .LBB2_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB2_57 +; GFX9-NEXT: .LBB2_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB2_60 +; GFX9-NEXT: .LBB2_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB2_63 +; GFX9-NEXT: .LBB2_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 
0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB2_66 +; GFX9-NEXT: .LBB2_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB2_69 +; GFX9-NEXT: .LBB2_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB2_72 +; GFX9-NEXT: .LBB2_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB2_75 +; GFX9-NEXT: .LBB2_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; 
GFX9-NEXT: .LBB2_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB2_78 +; GFX9-NEXT: .LBB2_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB2_81 +; GFX9-NEXT: .LBB2_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB2_84 +; GFX9-NEXT: .LBB2_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: 
s_cbranch_scc1 .LBB2_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB2_87 +; GFX9-NEXT: .LBB2_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB2_90 +; GFX9-NEXT: .LBB2_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB2_93 +; GFX9-NEXT: .LBB2_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB2_96 +; GFX9-NEXT: .LBB2_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 
s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB2_99 +; GFX9-NEXT: .LBB2_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB2_102 +; GFX9-NEXT: .LBB2_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB2_105 +; GFX9-NEXT: .LBB2_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB2_108 +; GFX9-NEXT: .LBB2_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB2_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB2_111 +; GFX9-NEXT: .LBB2_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB2_114 +; GFX9-NEXT: .LBB2_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB2_117 +; GFX9-NEXT: .LBB2_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_119 +; 
GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB2_120 +; GFX9-NEXT: .LBB2_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB2_123 +; GFX9-NEXT: .LBB2_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB2_126 +; GFX9-NEXT: .LBB2_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB2_129 +; GFX9-NEXT: .LBB2_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB2_132 +; GFX9-NEXT: .LBB2_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB2_135 +; GFX9-NEXT: .LBB2_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB2_138 +; GFX9-NEXT: .LBB2_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB2_141 +; GFX9-NEXT: .LBB2_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB2_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB2_144 +; GFX9-NEXT: .LBB2_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB2_147 +; GFX9-NEXT: .LBB2_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB2_150 +; GFX9-NEXT: .LBB2_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: 
s_cbranch_scc1 .LBB2_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB2_153 +; GFX9-NEXT: .LBB2_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB2_156 +; GFX9-NEXT: .LBB2_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB2_159 +; GFX9-NEXT: .LBB2_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB2_162 +; GFX9-NEXT: .LBB2_161: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, 
exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB2_165 +; GFX9-NEXT: .LBB2_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB2_168 +; GFX9-NEXT: .LBB2_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB2_171 +; GFX9-NEXT: .LBB2_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 57 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB2_174 +; GFX9-NEXT: .LBB2_173: +; GFX9-NEXT: 
; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB2_177 +; GFX9-NEXT: .LBB2_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB2_180 +; GFX9-NEXT: .LBB2_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB2_183 +; GFX9-NEXT: .LBB2_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: 
v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB2_186 +; GFX9-NEXT: .LBB2_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB2_189 +; GFX9-NEXT: .LBB2_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB2_192 +; GFX9-NEXT: .LBB2_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB2_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB2_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB2_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB2_2: +; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB2_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc 
bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB2_3 +; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB2_6 +; GFX10W64-NEXT: .LBB2_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_8 +; 
GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX10W64-NEXT: s_branch .LBB2_9 +; GFX10W64-NEXT: .LBB2_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB2_12 +; GFX10W64-NEXT: .LBB2_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB2_15 +; GFX10W64-NEXT: .LBB2_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX10W64-NEXT: ; %bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB2_18 +; GFX10W64-NEXT: .LBB2_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; 
GFX10W64-NEXT: .LBB2_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB2_21 +; GFX10W64-NEXT: .LBB2_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB2_24 +; GFX10W64-NEXT: .LBB2_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB2_27 +; GFX10W64-NEXT: .LBB2_26: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 
s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX10W64-NEXT: ; %bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX10W64-NEXT: s_branch .LBB2_30 +; GFX10W64-NEXT: .LBB2_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB2_33 +; GFX10W64-NEXT: .LBB2_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB2_36 +; GFX10W64-NEXT: .LBB2_35: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 
exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB2_39 +; GFX10W64-NEXT: .LBB2_38: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_39: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB2_42 +; GFX10W64-NEXT: .LBB2_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_42: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB2_45 +; GFX10W64-NEXT: .LBB2_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX10W64-NEXT: ; 
%bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: s_branch .LBB2_48 +; GFX10W64-NEXT: .LBB2_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB2_51 +; GFX10W64-NEXT: .LBB2_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB2_54 +; GFX10W64-NEXT: .LBB2_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB2_57 +; GFX10W64-NEXT: .LBB2_56: +; 
GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_57: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB2_60 +; GFX10W64-NEXT: .LBB2_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB2_63 +; GFX10W64-NEXT: .LBB2_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB2_66 +; GFX10W64-NEXT: .LBB2_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 
+; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB2_69 +; GFX10W64-NEXT: .LBB2_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB2_72 +; GFX10W64-NEXT: .LBB2_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX10W64-NEXT: s_branch .LBB2_75 +; GFX10W64-NEXT: .LBB2_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_75: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: 
v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX10W64-NEXT: s_branch .LBB2_78 +; GFX10W64-NEXT: .LBB2_77: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB2_81 +; GFX10W64-NEXT: .LBB2_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB2_84 +; GFX10W64-NEXT: .LBB2_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_84: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB2_87 +; GFX10W64-NEXT: .LBB2_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB2_90 +; GFX10W64-NEXT: .LBB2_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB2_93 +; GFX10W64-NEXT: .LBB2_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB2_96 +; GFX10W64-NEXT: 
.LBB2_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB2_99 +; GFX10W64-NEXT: .LBB2_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB2_102 +; GFX10W64-NEXT: .LBB2_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX10W64-NEXT: s_branch .LBB2_105 +; GFX10W64-NEXT: .LBB2_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB2_108 +; GFX10W64-NEXT: .LBB2_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB2_111 +; GFX10W64-NEXT: .LBB2_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX10W64-NEXT: s_branch .LBB2_114 +; GFX10W64-NEXT: .LBB2_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_114: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 
64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB2_117 +; GFX10W64-NEXT: .LBB2_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB2_120 +; GFX10W64-NEXT: .LBB2_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB2_123 +; GFX10W64-NEXT: .LBB2_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB2_126 +; GFX10W64-NEXT: .LBB2_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB2_129 +; GFX10W64-NEXT: .LBB2_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB2_132 +; GFX10W64-NEXT: .LBB2_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: 
v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB2_135 +; GFX10W64-NEXT: .LBB2_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB2_138 +; GFX10W64-NEXT: .LBB2_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB2_141 +; GFX10W64-NEXT: .LBB2_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB2_144 +; GFX10W64-NEXT: .LBB2_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; 
GFX10W64-NEXT: .LBB2_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB2_147 +; GFX10W64-NEXT: .LBB2_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB2_150 +; GFX10W64-NEXT: .LBB2_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB2_153 +; GFX10W64-NEXT: .LBB2_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, 
s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX10W64-NEXT: ; %bb.154: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB2_156 +; GFX10W64-NEXT: .LBB2_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB2_159 +; GFX10W64-NEXT: .LBB2_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: s_branch .LBB2_162 +; GFX10W64-NEXT: .LBB2_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; 
GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB2_165 +; GFX10W64-NEXT: .LBB2_164: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB2_168 +; GFX10W64-NEXT: .LBB2_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB2_171 +; GFX10W64-NEXT: .LBB2_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB2_174 +; GFX10W64-NEXT: .LBB2_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB2_177 +; GFX10W64-NEXT: .LBB2_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_177: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB2_180 +; GFX10W64-NEXT: .LBB2_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX10W64-NEXT: ; %bb.181: +; 
GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch .LBB2_183 +; GFX10W64-NEXT: .LBB2_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB2_186 +; GFX10W64-NEXT: .LBB2_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB2_189 +; GFX10W64-NEXT: .LBB2_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, 
s6, 63 +; GFX10W64-NEXT: s_branch .LBB2_192 +; GFX10W64-NEXT: .LBB2_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB2_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB2_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_add v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB2_2: +; GFX10W64-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB2_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: 
v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB2_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_add v4, v0, s[4:7], 0 idxen glc +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB2_3 ; GFX10W32-NEXT: .LBB2_2: -; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 
v4, v3 -; GFX10W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] -; GFX10W32-NEXT: s_endpgm -; -; GFX11W64-LABEL: add_i32_varying_vdata: -; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: 
s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 -; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W64-NEXT: ; %bb.1: -; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_add_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB2_2: -; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_e32 v4, s2, v4 -; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] -; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_3: +; GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; 
GFX10W32-NEXT: s_branch .LBB2_6 +; GFX10W32-NEXT: .LBB2_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_6: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB2_9 +; GFX10W32-NEXT: .LBB2_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB2_12 +; GFX10W32-NEXT: .LBB2_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB2_15 +; GFX10W32-NEXT: .LBB2_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; 
GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB2_18 +; GFX10W32-NEXT: .LBB2_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB2_21 +; GFX10W32-NEXT: .LBB2_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: s_branch .LBB2_24 +; GFX10W32-NEXT: .LBB2_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_26 +; 
GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch .LBB2_27 +; GFX10W32-NEXT: .LBB2_26: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB2_30 +; GFX10W32-NEXT: .LBB2_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB2_33 +; GFX10W32-NEXT: .LBB2_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB2_36 +; GFX10W32-NEXT: .LBB2_35: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, 
exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB2_39 +; GFX10W32-NEXT: .LBB2_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB2_42 +; GFX10W32-NEXT: .LBB2_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX10W32-NEXT: s_branch .LBB2_45 +; GFX10W32-NEXT: .LBB2_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; 
GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX10W32-NEXT: ; %bb.46: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB2_48 +; GFX10W32-NEXT: .LBB2_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB2_51 +; GFX10W32-NEXT: .LBB2_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB2_54 +; GFX10W32-NEXT: .LBB2_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_54: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX10W32-NEXT: ; %bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB2_57 +; 
GFX10W32-NEXT: .LBB2_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_57: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB2_60 +; GFX10W32-NEXT: .LBB2_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB2_63 +; GFX10W32-NEXT: .LBB2_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB2_66 +; GFX10W32-NEXT: .LBB2_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 
+; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB2_69 +; GFX10W32-NEXT: .LBB2_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB2_72 +; GFX10W32-NEXT: .LBB2_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX10W32-NEXT: s_branch .LBB2_75 +; GFX10W32-NEXT: .LBB2_74: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: 
s_cbranch_scc1 .LBB2_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX10W32-NEXT: s_branch .LBB2_78 +; GFX10W32-NEXT: .LBB2_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB2_81 +; GFX10W32-NEXT: .LBB2_80: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB2_84 +; GFX10W32-NEXT: .LBB2_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch .LBB2_87 +; GFX10W32-NEXT: .LBB2_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: 
.LBB2_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB2_90 +; GFX10W32-NEXT: .LBB2_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB2_93 +; GFX10W32-NEXT: .LBB2_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB2_96 +; GFX10W32-NEXT: .LBB2_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB2_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: 
s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB2_98 +; GFX10W32-NEXT: ; %bb.97: +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc +; GFX10W32-NEXT: .LBB2_98: +; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10W32-NEXT: s_waitcnt vmcnt(0) +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10W32-NEXT: s_endpgm +; +; GFX11W64-LABEL: add_i32_varying_vdata: +; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB2_3 +; GFX11W64-NEXT: .LBB2_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB2_6 +; GFX11W64-NEXT: 
.LBB2_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX11W64-NEXT: ; %bb.7: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB2_9 +; GFX11W64-NEXT: .LBB2_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB2_12 +; GFX11W64-NEXT: .LBB2_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB2_15 +; GFX11W64-NEXT: .LBB2_14: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: 
s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB2_18 +; GFX11W64-NEXT: .LBB2_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_18: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB2_21 +; GFX11W64-NEXT: .LBB2_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB2_24 +; GFX11W64-NEXT: .LBB2_23: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 
s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB2_27 +; GFX11W64-NEXT: .LBB2_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB2_30 +; GFX11W64-NEXT: .LBB2_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch .LBB2_33 +; GFX11W64-NEXT: .LBB2_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 
.LBB2_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB2_36 +; GFX11W64-NEXT: .LBB2_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB2_39 +; GFX11W64-NEXT: .LBB2_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX11W64-NEXT: ; %bb.40: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB2_42 +; GFX11W64-NEXT: .LBB2_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB2_45 +; 
GFX11W64-NEXT: .LBB2_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX11W64-NEXT: ; %bb.46: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB2_48 +; GFX11W64-NEXT: .LBB2_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB2_51 +; GFX11W64-NEXT: .LBB2_50: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB2_54 +; GFX11W64-NEXT: .LBB2_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: 
s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB2_57 +; GFX11W64-NEXT: .LBB2_56: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB2_60 +; GFX11W64-NEXT: .LBB2_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX11W64-NEXT: s_branch .LBB2_63 +; GFX11W64-NEXT: .LBB2_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_63: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; 
GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB2_66 +; GFX11W64-NEXT: .LBB2_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB2_69 +; GFX11W64-NEXT: .LBB2_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB2_72 +; GFX11W64-NEXT: .LBB2_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_72: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB2_75 +; GFX11W64-NEXT: .LBB2_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB2_78 +; GFX11W64-NEXT: .LBB2_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB2_81 +; GFX11W64-NEXT: .LBB2_80: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: 
v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB2_84 +; GFX11W64-NEXT: .LBB2_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB2_87 +; GFX11W64-NEXT: .LBB2_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch .LBB2_90 +; GFX11W64-NEXT: .LBB2_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB2_93 +; GFX11W64-NEXT: .LBB2_92: +; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB2_96 +; GFX11W64-NEXT: .LBB2_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB2_99 +; GFX11W64-NEXT: .LBB2_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB2_102 +; GFX11W64-NEXT: .LBB2_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB2_105 +; GFX11W64-NEXT: .LBB2_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX11W64-NEXT: s_branch .LBB2_108 +; GFX11W64-NEXT: .LBB2_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB2_111 +; GFX11W64-NEXT: .LBB2_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: 
s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX11W64-NEXT: s_branch .LBB2_114 +; GFX11W64-NEXT: .LBB2_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB2_117 +; GFX11W64-NEXT: .LBB2_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB2_120 +; GFX11W64-NEXT: .LBB2_119: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX11W64-NEXT: s_cbranch_scc1 .LBB2_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB2_123 +; GFX11W64-NEXT: .LBB2_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB2_126 +; GFX11W64-NEXT: .LBB2_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch .LBB2_129 +; GFX11W64-NEXT: .LBB2_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB2_132 
+; GFX11W64-NEXT: .LBB2_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB2_135 +; GFX11W64-NEXT: .LBB2_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX11W64-NEXT: s_branch .LBB2_138 +; GFX11W64-NEXT: .LBB2_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB2_141 +; GFX11W64-NEXT: .LBB2_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_141: +; 
GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB2_144 +; GFX11W64-NEXT: .LBB2_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB2_147 +; GFX11W64-NEXT: .LBB2_146: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB2_150 +; GFX11W64-NEXT: .LBB2_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB2_153 +; GFX11W64-NEXT: .LBB2_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch .LBB2_156 +; GFX11W64-NEXT: .LBB2_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_158 +; GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB2_159 +; GFX11W64-NEXT: .LBB2_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: 
s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: s_branch .LBB2_162 +; GFX11W64-NEXT: .LBB2_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB2_165 +; GFX11W64-NEXT: .LBB2_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB2_168 +; GFX11W64-NEXT: .LBB2_167: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB2_171 +; GFX11W64-NEXT: .LBB2_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB2_174 +; GFX11W64-NEXT: .LBB2_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX11W64-NEXT: s_branch .LBB2_177 +; GFX11W64-NEXT: .LBB2_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; 
GFX11W64-NEXT: s_branch .LBB2_180 +; GFX11W64-NEXT: .LBB2_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB2_183 +; GFX11W64-NEXT: .LBB2_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB2_186 +; GFX11W64-NEXT: .LBB2_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_186: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB2_189 +; GFX11W64-NEXT: 
.LBB2_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB2_192 +; GFX11W64-NEXT: .LBB2_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB2_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_cbranch_execz .LBB2_194 +; GFX11W64-NEXT: ; %bb.193: +; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB2_194: +; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11W64-NEXT: s_waitcnt vmcnt(0) +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1 +; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: 
add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_2 +; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB2_3 +; GFX11W32-NEXT: .LBB2_2: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, 
exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB2_6 +; GFX11W32-NEXT: .LBB2_5: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB2_9 +; GFX11W32-NEXT: .LBB2_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB2_12 +; GFX11W32-NEXT: .LBB2_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: 
v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB2_15 +; GFX11W32-NEXT: .LBB2_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_17 +; GFX11W32-NEXT: ; %bb.16: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB2_18 +; GFX11W32-NEXT: .LBB2_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB2_21 +; GFX11W32-NEXT: .LBB2_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB2_24 +; GFX11W32-NEXT: .LBB2_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: s_branch .LBB2_27 +; GFX11W32-NEXT: .LBB2_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB2_30 +; GFX11W32-NEXT: .LBB2_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_30: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB2_33 +; GFX11W32-NEXT: .LBB2_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 
s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_35 +; GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB2_36 +; GFX11W32-NEXT: .LBB2_35: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB2_39 +; GFX11W32-NEXT: .LBB2_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB2_42 +; GFX11W32-NEXT: .LBB2_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_44 +; GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB2_45 +; GFX11W32-NEXT: .LBB2_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; 
GFX11W32-NEXT: .LBB2_45: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB2_48 +; GFX11W32-NEXT: .LBB2_47: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX11W32-NEXT: s_branch .LBB2_51 +; GFX11W32-NEXT: .LBB2_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB2_54 +; GFX11W32-NEXT: .LBB2_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_54: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 
s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_56 +; GFX11W32-NEXT: ; %bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB2_57 +; GFX11W32-NEXT: .LBB2_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_57: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB2_60 +; GFX11W32-NEXT: .LBB2_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB2_63 +; GFX11W32-NEXT: .LBB2_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: 
v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB2_66 +; GFX11W32-NEXT: .LBB2_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_68 +; GFX11W32-NEXT: ; %bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB2_69 +; GFX11W32-NEXT: .LBB2_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB2_72 +; GFX11W32-NEXT: .LBB2_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch .LBB2_75 +; GFX11W32-NEXT: .LBB2_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: 
s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX11W32-NEXT: s_branch .LBB2_78 +; GFX11W32-NEXT: .LBB2_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB2_81 +; GFX11W32-NEXT: .LBB2_80: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB2_84 +; GFX11W32-NEXT: .LBB2_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: 
s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch .LBB2_87 +; GFX11W32-NEXT: .LBB2_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB2_90 +; GFX11W32-NEXT: .LBB2_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB2_93 +; GFX11W32-NEXT: .LBB2_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; 
GFX11W32-NEXT: s_branch .LBB2_96 +; GFX11W32-NEXT: .LBB2_95: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB2_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB2_98 +; GFX11W32-NEXT: ; %bb.97: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB2_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_add_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB2_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB2_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1 ; 
GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: @@ -1411,312 +6131,5033 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX8-NEXT: s_mov_b32 s3, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 0 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v1, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v1, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX8-NEXT: v_writelane_b32 v1, 0, 0 +; GFX8-NEXT: s_branch .LBB7_3 +; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_3: +; GFX8-NEXT: s_and_b64 s[4:5], 
s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s6, s2, 0 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 1 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX8-NEXT: ; %bb.4: +; GFX8-NEXT: v_writelane_b32 v1, s6, 1 +; GFX8-NEXT: s_branch .LBB7_6 +; GFX8-NEXT: .LBB7_5: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_6: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 2 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX8-NEXT: ; %bb.7: +; GFX8-NEXT: v_writelane_b32 v1, s6, 2 +; GFX8-NEXT: s_branch .LBB7_9 +; GFX8-NEXT: .LBB7_8: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_9: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 3 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX8-NEXT: ; %bb.10: +; GFX8-NEXT: v_writelane_b32 v1, s6, 3 +; GFX8-NEXT: s_branch .LBB7_12 +; GFX8-NEXT: .LBB7_11: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_12: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 4 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX8-NEXT: ; %bb.13: +; GFX8-NEXT: v_writelane_b32 v1, s6, 4 +; GFX8-NEXT: s_branch .LBB7_15 +; GFX8-NEXT: .LBB7_14: +; 
GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_15: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 5 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX8-NEXT: ; %bb.16: +; GFX8-NEXT: v_writelane_b32 v1, s6, 5 +; GFX8-NEXT: s_branch .LBB7_18 +; GFX8-NEXT: .LBB7_17: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_18: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 6 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX8-NEXT: ; %bb.19: +; GFX8-NEXT: v_writelane_b32 v1, s6, 6 +; GFX8-NEXT: s_branch .LBB7_21 +; GFX8-NEXT: .LBB7_20: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_21: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 7 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX8-NEXT: ; %bb.22: +; GFX8-NEXT: v_writelane_b32 v1, s6, 7 +; GFX8-NEXT: s_branch .LBB7_24 +; GFX8-NEXT: .LBB7_23: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_24: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 8 +; GFX8-NEXT: 
s_cbranch_scc1 .LBB7_26 +; GFX8-NEXT: ; %bb.25: +; GFX8-NEXT: v_writelane_b32 v1, s6, 8 +; GFX8-NEXT: s_branch .LBB7_27 +; GFX8-NEXT: .LBB7_26: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_27: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 9 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX8-NEXT: ; %bb.28: +; GFX8-NEXT: v_writelane_b32 v1, s6, 9 +; GFX8-NEXT: s_branch .LBB7_30 +; GFX8-NEXT: .LBB7_29: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_30: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 10 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX8-NEXT: ; %bb.31: +; GFX8-NEXT: v_writelane_b32 v1, s6, 10 +; GFX8-NEXT: s_branch .LBB7_33 +; GFX8-NEXT: .LBB7_32: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_33: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 11 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX8-NEXT: ; %bb.34: +; GFX8-NEXT: v_writelane_b32 v1, s6, 11 +; GFX8-NEXT: s_branch .LBB7_36 +; GFX8-NEXT: .LBB7_35: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_36: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX8-NEXT: 
s_bitcmp1_b32 exec_lo, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 12 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX8-NEXT: ; %bb.37: +; GFX8-NEXT: v_writelane_b32 v1, s6, 12 +; GFX8-NEXT: s_branch .LBB7_39 +; GFX8-NEXT: .LBB7_38: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_39: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 13 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX8-NEXT: ; %bb.40: +; GFX8-NEXT: v_writelane_b32 v1, s6, 13 +; GFX8-NEXT: s_branch .LBB7_42 +; GFX8-NEXT: .LBB7_41: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_42: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 14 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX8-NEXT: ; %bb.43: +; GFX8-NEXT: v_writelane_b32 v1, s6, 14 +; GFX8-NEXT: s_branch .LBB7_45 +; GFX8-NEXT: .LBB7_44: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_45: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 15 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX8-NEXT: ; %bb.46: +; GFX8-NEXT: v_writelane_b32 v1, s6, 15 +; GFX8-NEXT: s_branch .LBB7_48 +; GFX8-NEXT: .LBB7_47: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_48: +; 
GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 16 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX8-NEXT: ; %bb.49: +; GFX8-NEXT: v_writelane_b32 v1, s6, 16 +; GFX8-NEXT: s_branch .LBB7_51 +; GFX8-NEXT: .LBB7_50: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_51: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 17 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX8-NEXT: ; %bb.52: +; GFX8-NEXT: v_writelane_b32 v1, s6, 17 +; GFX8-NEXT: s_branch .LBB7_54 +; GFX8-NEXT: .LBB7_53: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_54: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 18 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX8-NEXT: ; %bb.55: +; GFX8-NEXT: v_writelane_b32 v1, s6, 18 +; GFX8-NEXT: s_branch .LBB7_57 +; GFX8-NEXT: .LBB7_56: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_57: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 19 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX8-NEXT: ; 
%bb.58: +; GFX8-NEXT: v_writelane_b32 v1, s6, 19 +; GFX8-NEXT: s_branch .LBB7_60 +; GFX8-NEXT: .LBB7_59: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_60: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 20 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX8-NEXT: ; %bb.61: +; GFX8-NEXT: v_writelane_b32 v1, s6, 20 +; GFX8-NEXT: s_branch .LBB7_63 +; GFX8-NEXT: .LBB7_62: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_63: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 21 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX8-NEXT: ; %bb.64: +; GFX8-NEXT: v_writelane_b32 v1, s6, 21 +; GFX8-NEXT: s_branch .LBB7_66 +; GFX8-NEXT: .LBB7_65: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_66: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 22 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX8-NEXT: ; %bb.67: +; GFX8-NEXT: v_writelane_b32 v1, s6, 22 +; GFX8-NEXT: s_branch .LBB7_69 +; GFX8-NEXT: .LBB7_68: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_69: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 23 
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 23 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX8-NEXT: ; %bb.70: +; GFX8-NEXT: v_writelane_b32 v1, s6, 23 +; GFX8-NEXT: s_branch .LBB7_72 +; GFX8-NEXT: .LBB7_71: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_72: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 24 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX8-NEXT: ; %bb.73: +; GFX8-NEXT: v_writelane_b32 v1, s6, 24 +; GFX8-NEXT: s_branch .LBB7_75 +; GFX8-NEXT: .LBB7_74: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_75: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 25 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX8-NEXT: ; %bb.76: +; GFX8-NEXT: v_writelane_b32 v1, s6, 25 +; GFX8-NEXT: s_branch .LBB7_78 +; GFX8-NEXT: .LBB7_77: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_78: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 26 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX8-NEXT: ; %bb.79: +; GFX8-NEXT: v_writelane_b32 v1, s6, 26 +; GFX8-NEXT: s_branch .LBB7_81 +; GFX8-NEXT: .LBB7_80: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_81: +; GFX8-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 27 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX8-NEXT: ; %bb.82: +; GFX8-NEXT: v_writelane_b32 v1, s6, 27 +; GFX8-NEXT: s_branch .LBB7_84 +; GFX8-NEXT: .LBB7_83: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_84: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 28 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX8-NEXT: ; %bb.85: +; GFX8-NEXT: v_writelane_b32 v1, s6, 28 +; GFX8-NEXT: s_branch .LBB7_87 +; GFX8-NEXT: .LBB7_86: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_87: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s2 +; GFX8-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s2, v0, 29 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX8-NEXT: ; %bb.88: +; GFX8-NEXT: v_writelane_b32 v1, s6, 29 +; GFX8-NEXT: s_branch .LBB7_90 +; GFX8-NEXT: .LBB7_89: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_90: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s2, s2, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s2 +; GFX8-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX8-NEXT: s_mov_b32 s7, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX8-NEXT: v_readlane_b32 s5, v0, 30 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_92 +; 
GFX8-NEXT: ; %bb.91: +; GFX8-NEXT: v_writelane_b32 v1, s4, 30 +; GFX8-NEXT: s_branch .LBB7_93 +; GFX8-NEXT: .LBB7_92: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_93: +; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX8-NEXT: s_cselect_b32 s2, s5, 0 +; GFX8-NEXT: s_add_i32 s6, s4, s2 +; GFX8-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX8-NEXT: v_readlane_b32 s3, v0, 31 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX8-NEXT: ; %bb.94: +; GFX8-NEXT: v_writelane_b32 v1, s6, 31 +; GFX8-NEXT: s_branch .LBB7_96 +; GFX8-NEXT: .LBB7_95: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_96: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX8-NEXT: s_mov_b32 s2, 0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 32 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX8-NEXT: ; %bb.97: +; GFX8-NEXT: v_writelane_b32 v1, s6, 32 +; GFX8-NEXT: s_branch .LBB7_99 +; GFX8-NEXT: .LBB7_98: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_99: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 33 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX8-NEXT: ; %bb.100: +; GFX8-NEXT: v_writelane_b32 v1, s6, 33 +; GFX8-NEXT: s_branch .LBB7_102 +; GFX8-NEXT: .LBB7_101: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_102: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX8-NEXT: 
s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 34 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX8-NEXT: ; %bb.103: +; GFX8-NEXT: v_writelane_b32 v1, s6, 34 +; GFX8-NEXT: s_branch .LBB7_105 +; GFX8-NEXT: .LBB7_104: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_105: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 35 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX8-NEXT: ; %bb.106: +; GFX8-NEXT: v_writelane_b32 v1, s6, 35 +; GFX8-NEXT: s_branch .LBB7_108 +; GFX8-NEXT: .LBB7_107: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_108: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 36 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX8-NEXT: ; %bb.109: +; GFX8-NEXT: v_writelane_b32 v1, s6, 36 +; GFX8-NEXT: s_branch .LBB7_111 +; GFX8-NEXT: .LBB7_110: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_111: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 37 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX8-NEXT: ; %bb.112: +; GFX8-NEXT: v_writelane_b32 v1, s6, 37 +; GFX8-NEXT: s_branch .LBB7_114 +; GFX8-NEXT: .LBB7_113: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_114: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; 
GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 38 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX8-NEXT: ; %bb.115: +; GFX8-NEXT: v_writelane_b32 v1, s6, 38 +; GFX8-NEXT: s_branch .LBB7_117 +; GFX8-NEXT: .LBB7_116: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_117: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 39 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX8-NEXT: ; %bb.118: +; GFX8-NEXT: v_writelane_b32 v1, s6, 39 +; GFX8-NEXT: s_branch .LBB7_120 +; GFX8-NEXT: .LBB7_119: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_120: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 40 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX8-NEXT: ; %bb.121: +; GFX8-NEXT: v_writelane_b32 v1, s6, 40 +; GFX8-NEXT: s_branch .LBB7_123 +; GFX8-NEXT: .LBB7_122: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_123: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 41 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX8-NEXT: ; %bb.124: +; GFX8-NEXT: v_writelane_b32 v1, 
s6, 41 +; GFX8-NEXT: s_branch .LBB7_126 +; GFX8-NEXT: .LBB7_125: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_126: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 42 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX8-NEXT: ; %bb.127: +; GFX8-NEXT: v_writelane_b32 v1, s6, 42 +; GFX8-NEXT: s_branch .LBB7_129 +; GFX8-NEXT: .LBB7_128: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_129: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 43 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX8-NEXT: ; %bb.130: +; GFX8-NEXT: v_writelane_b32 v1, s6, 43 +; GFX8-NEXT: s_branch .LBB7_132 +; GFX8-NEXT: .LBB7_131: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_132: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 44 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX8-NEXT: ; %bb.133: +; GFX8-NEXT: v_writelane_b32 v1, s6, 44 +; GFX8-NEXT: s_branch .LBB7_135 +; GFX8-NEXT: .LBB7_134: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_135: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX8-NEXT: s_cselect_b64 
s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 45 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX8-NEXT: ; %bb.136: +; GFX8-NEXT: v_writelane_b32 v1, s6, 45 +; GFX8-NEXT: s_branch .LBB7_138 +; GFX8-NEXT: .LBB7_137: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_138: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 46 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX8-NEXT: ; %bb.139: +; GFX8-NEXT: v_writelane_b32 v1, s6, 46 +; GFX8-NEXT: s_branch .LBB7_141 +; GFX8-NEXT: .LBB7_140: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_141: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 47 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX8-NEXT: ; %bb.142: +; GFX8-NEXT: v_writelane_b32 v1, s6, 47 +; GFX8-NEXT: s_branch .LBB7_144 +; GFX8-NEXT: .LBB7_143: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_144: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 48 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX8-NEXT: ; %bb.145: +; GFX8-NEXT: v_writelane_b32 v1, s6, 48 +; GFX8-NEXT: s_branch .LBB7_147 +; GFX8-NEXT: .LBB7_146: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_147: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec 
+; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 49 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX8-NEXT: ; %bb.148: +; GFX8-NEXT: v_writelane_b32 v1, s6, 49 +; GFX8-NEXT: s_branch .LBB7_150 +; GFX8-NEXT: .LBB7_149: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_150: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 50 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX8-NEXT: ; %bb.151: +; GFX8-NEXT: v_writelane_b32 v1, s6, 50 +; GFX8-NEXT: s_branch .LBB7_153 +; GFX8-NEXT: .LBB7_152: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_153: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 51 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX8-NEXT: ; %bb.154: +; GFX8-NEXT: v_writelane_b32 v1, s6, 51 +; GFX8-NEXT: s_branch .LBB7_156 +; GFX8-NEXT: .LBB7_155: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_156: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 52 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX8-NEXT: ; %bb.157: +; GFX8-NEXT: 
v_writelane_b32 v1, s6, 52 +; GFX8-NEXT: s_branch .LBB7_159 +; GFX8-NEXT: .LBB7_158: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_159: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 53 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX8-NEXT: ; %bb.160: +; GFX8-NEXT: v_writelane_b32 v1, s6, 53 +; GFX8-NEXT: s_branch .LBB7_162 +; GFX8-NEXT: .LBB7_161: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_162: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 54 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX8-NEXT: ; %bb.163: +; GFX8-NEXT: v_writelane_b32 v1, s6, 54 +; GFX8-NEXT: s_branch .LBB7_165 +; GFX8-NEXT: .LBB7_164: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_165: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 55 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX8-NEXT: ; %bb.166: +; GFX8-NEXT: v_writelane_b32 v1, s6, 55 +; GFX8-NEXT: s_branch .LBB7_168 +; GFX8-NEXT: .LBB7_167: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_168: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 24 +; 
GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 56 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX8-NEXT: ; %bb.169: +; GFX8-NEXT: v_writelane_b32 v1, s6, 56 +; GFX8-NEXT: s_branch .LBB7_171 +; GFX8-NEXT: .LBB7_170: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_171: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 57 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX8-NEXT: ; %bb.172: +; GFX8-NEXT: v_writelane_b32 v1, s6, 57 +; GFX8-NEXT: s_branch .LBB7_174 +; GFX8-NEXT: .LBB7_173: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_174: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 58 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX8-NEXT: ; %bb.175: +; GFX8-NEXT: v_writelane_b32 v1, s6, 58 +; GFX8-NEXT: s_branch .LBB7_177 +; GFX8-NEXT: .LBB7_176: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_177: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 59 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX8-NEXT: ; %bb.178: +; GFX8-NEXT: v_writelane_b32 v1, s6, 59 +; GFX8-NEXT: s_branch .LBB7_180 +; GFX8-NEXT: .LBB7_179: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_180: +; 
GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 60 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX8-NEXT: ; %bb.181: +; GFX8-NEXT: v_writelane_b32 v1, s6, 60 +; GFX8-NEXT: s_branch .LBB7_183 +; GFX8-NEXT: .LBB7_182: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_183: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 61 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX8-NEXT: ; %bb.184: +; GFX8-NEXT: v_writelane_b32 v1, s6, 61 +; GFX8-NEXT: s_branch .LBB7_186 +; GFX8-NEXT: .LBB7_185: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_186: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s3, v0, 62 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX8-NEXT: ; %bb.187: +; GFX8-NEXT: v_writelane_b32 v1, s6, 62 +; GFX8-NEXT: s_branch .LBB7_189 +; GFX8-NEXT: .LBB7_188: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_189: +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s3, s3, 0 +; GFX8-NEXT: s_add_i32 s6, s6, s3 +; GFX8-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX8-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX8-NEXT: 
s_cmp_eq_u64 s[2:3], 0 +; GFX8-NEXT: v_readlane_b32 s7, v0, 63 +; GFX8-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX8-NEXT: ; %bb.190: +; GFX8-NEXT: v_writelane_b32 v1, s6, 63 +; GFX8-NEXT: s_branch .LBB7_192 +; GFX8-NEXT: .LBB7_191: +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: .LBB7_192: +; GFX8-NEXT: v_readfirstlane_b32 s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz .LBB7_2 -; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_cbranch_execz .LBB7_194 +; GFX8-NEXT: ; %bb.193: ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX8-NEXT: s_cselect_b32 s4, s7, 0 +; GFX8-NEXT: s_add_i32 s4, s6, s4 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX8-NEXT: .LBB7_2: +; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX8-NEXT: .LBB7_194: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v2 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_mov_b32_e32 v3, s0 -; GFX8-NEXT: flat_store_dword v[3:4], v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 1 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX9-NEXT: s_mov_b32 s3, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 
0 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v1, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_writelane_b32 v1, 0, 0 +; GFX9-NEXT: s_branch .LBB7_3 +; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_3: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s6, s2, 0 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 1 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX9-NEXT: ; %bb.4: +; GFX9-NEXT: v_writelane_b32 v1, s6, 1 +; GFX9-NEXT: s_branch .LBB7_6 +; GFX9-NEXT: .LBB7_5: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_6: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 
+; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 2 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX9-NEXT: ; %bb.7: +; GFX9-NEXT: v_writelane_b32 v1, s6, 2 +; GFX9-NEXT: s_branch .LBB7_9 +; GFX9-NEXT: .LBB7_8: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_9: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 3 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX9-NEXT: ; %bb.10: +; GFX9-NEXT: v_writelane_b32 v1, s6, 3 +; GFX9-NEXT: s_branch .LBB7_12 +; GFX9-NEXT: .LBB7_11: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_12: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 4 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX9-NEXT: ; %bb.13: +; GFX9-NEXT: v_writelane_b32 v1, s6, 4 +; GFX9-NEXT: s_branch .LBB7_15 +; GFX9-NEXT: .LBB7_14: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_15: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 5 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX9-NEXT: ; %bb.16: +; GFX9-NEXT: v_writelane_b32 v1, s6, 5 +; GFX9-NEXT: s_branch .LBB7_18 +; GFX9-NEXT: .LBB7_17: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_18: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 
s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 6 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX9-NEXT: ; %bb.19: +; GFX9-NEXT: v_writelane_b32 v1, s6, 6 +; GFX9-NEXT: s_branch .LBB7_21 +; GFX9-NEXT: .LBB7_20: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_21: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 7 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX9-NEXT: ; %bb.22: +; GFX9-NEXT: v_writelane_b32 v1, s6, 7 +; GFX9-NEXT: s_branch .LBB7_24 +; GFX9-NEXT: .LBB7_23: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_24: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 8 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX9-NEXT: ; %bb.25: +; GFX9-NEXT: v_writelane_b32 v1, s6, 8 +; GFX9-NEXT: s_branch .LBB7_27 +; GFX9-NEXT: .LBB7_26: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_27: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 9 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX9-NEXT: ; %bb.28: +; GFX9-NEXT: v_writelane_b32 v1, s6, 9 +; GFX9-NEXT: s_branch .LBB7_30 +; GFX9-NEXT: .LBB7_29: +; GFX9-NEXT: ; 
implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_30: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 10 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX9-NEXT: ; %bb.31: +; GFX9-NEXT: v_writelane_b32 v1, s6, 10 +; GFX9-NEXT: s_branch .LBB7_33 +; GFX9-NEXT: .LBB7_32: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_33: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 11 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX9-NEXT: ; %bb.34: +; GFX9-NEXT: v_writelane_b32 v1, s6, 11 +; GFX9-NEXT: s_branch .LBB7_36 +; GFX9-NEXT: .LBB7_35: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_36: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 12 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX9-NEXT: ; %bb.37: +; GFX9-NEXT: v_writelane_b32 v1, s6, 12 +; GFX9-NEXT: s_branch .LBB7_39 +; GFX9-NEXT: .LBB7_38: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_39: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 13 +; 
GFX9-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX9-NEXT: ; %bb.40: +; GFX9-NEXT: v_writelane_b32 v1, s6, 13 +; GFX9-NEXT: s_branch .LBB7_42 +; GFX9-NEXT: .LBB7_41: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_42: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 14 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX9-NEXT: ; %bb.43: +; GFX9-NEXT: v_writelane_b32 v1, s6, 14 +; GFX9-NEXT: s_branch .LBB7_45 +; GFX9-NEXT: .LBB7_44: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_45: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 15 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX9-NEXT: ; %bb.46: +; GFX9-NEXT: v_writelane_b32 v1, s6, 15 +; GFX9-NEXT: s_branch .LBB7_48 +; GFX9-NEXT: .LBB7_47: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_48: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 16 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX9-NEXT: ; %bb.49: +; GFX9-NEXT: v_writelane_b32 v1, s6, 16 +; GFX9-NEXT: s_branch .LBB7_51 +; GFX9-NEXT: .LBB7_50: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_51: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 
0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 17 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX9-NEXT: ; %bb.52: +; GFX9-NEXT: v_writelane_b32 v1, s6, 17 +; GFX9-NEXT: s_branch .LBB7_54 +; GFX9-NEXT: .LBB7_53: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_54: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 18 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX9-NEXT: ; %bb.55: +; GFX9-NEXT: v_writelane_b32 v1, s6, 18 +; GFX9-NEXT: s_branch .LBB7_57 +; GFX9-NEXT: .LBB7_56: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_57: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 19 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX9-NEXT: ; %bb.58: +; GFX9-NEXT: v_writelane_b32 v1, s6, 19 +; GFX9-NEXT: s_branch .LBB7_60 +; GFX9-NEXT: .LBB7_59: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_60: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 20 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX9-NEXT: ; %bb.61: +; GFX9-NEXT: v_writelane_b32 v1, s6, 20 +; GFX9-NEXT: s_branch .LBB7_63 +; GFX9-NEXT: .LBB7_62: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; 
GFX9-NEXT: .LBB7_63: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 21 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX9-NEXT: ; %bb.64: +; GFX9-NEXT: v_writelane_b32 v1, s6, 21 +; GFX9-NEXT: s_branch .LBB7_66 +; GFX9-NEXT: .LBB7_65: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_66: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 22 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX9-NEXT: ; %bb.67: +; GFX9-NEXT: v_writelane_b32 v1, s6, 22 +; GFX9-NEXT: s_branch .LBB7_69 +; GFX9-NEXT: .LBB7_68: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_69: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 23 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX9-NEXT: ; %bb.70: +; GFX9-NEXT: v_writelane_b32 v1, s6, 23 +; GFX9-NEXT: s_branch .LBB7_72 +; GFX9-NEXT: .LBB7_71: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_72: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 24 +; GFX9-NEXT: 
s_cbranch_scc1 .LBB7_74 +; GFX9-NEXT: ; %bb.73: +; GFX9-NEXT: v_writelane_b32 v1, s6, 24 +; GFX9-NEXT: s_branch .LBB7_75 +; GFX9-NEXT: .LBB7_74: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_75: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 25 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX9-NEXT: ; %bb.76: +; GFX9-NEXT: v_writelane_b32 v1, s6, 25 +; GFX9-NEXT: s_branch .LBB7_78 +; GFX9-NEXT: .LBB7_77: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_78: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 26 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX9-NEXT: ; %bb.79: +; GFX9-NEXT: v_writelane_b32 v1, s6, 26 +; GFX9-NEXT: s_branch .LBB7_81 +; GFX9-NEXT: .LBB7_80: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_81: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 27 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX9-NEXT: ; %bb.82: +; GFX9-NEXT: v_writelane_b32 v1, s6, 27 +; GFX9-NEXT: s_branch .LBB7_84 +; GFX9-NEXT: .LBB7_83: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_84: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 
0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 28 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX9-NEXT: ; %bb.85: +; GFX9-NEXT: v_writelane_b32 v1, s6, 28 +; GFX9-NEXT: s_branch .LBB7_87 +; GFX9-NEXT: .LBB7_86: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_87: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s2 +; GFX9-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s2, v0, 29 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX9-NEXT: ; %bb.88: +; GFX9-NEXT: v_writelane_b32 v1, s6, 29 +; GFX9-NEXT: s_branch .LBB7_90 +; GFX9-NEXT: .LBB7_89: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_90: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s2 +; GFX9-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX9-NEXT: s_mov_b32 s7, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX9-NEXT: v_readlane_b32 s5, v0, 30 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX9-NEXT: ; %bb.91: +; GFX9-NEXT: v_writelane_b32 v1, s4, 30 +; GFX9-NEXT: s_branch .LBB7_93 +; GFX9-NEXT: .LBB7_92: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_93: +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_add_i32 s6, s4, s2 +; GFX9-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX9-NEXT: v_readlane_b32 s3, v0, 31 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX9-NEXT: ; %bb.94: +; GFX9-NEXT: v_writelane_b32 v1, s6, 31 +; GFX9-NEXT: s_branch .LBB7_96 +; GFX9-NEXT: .LBB7_95: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB7_96: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX9-NEXT: s_mov_b32 s2, 0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 32 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX9-NEXT: ; %bb.97: +; GFX9-NEXT: v_writelane_b32 v1, s6, 32 +; GFX9-NEXT: s_branch .LBB7_99 +; GFX9-NEXT: .LBB7_98: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_99: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 33 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX9-NEXT: ; %bb.100: +; GFX9-NEXT: v_writelane_b32 v1, s6, 33 +; GFX9-NEXT: s_branch .LBB7_102 +; GFX9-NEXT: .LBB7_101: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_102: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 34 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX9-NEXT: ; %bb.103: +; GFX9-NEXT: v_writelane_b32 v1, s6, 34 +; GFX9-NEXT: s_branch .LBB7_105 +; GFX9-NEXT: .LBB7_104: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_105: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 35 +; GFX9-NEXT: s_cbranch_scc1 
.LBB7_107 +; GFX9-NEXT: ; %bb.106: +; GFX9-NEXT: v_writelane_b32 v1, s6, 35 +; GFX9-NEXT: s_branch .LBB7_108 +; GFX9-NEXT: .LBB7_107: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_108: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 36 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX9-NEXT: ; %bb.109: +; GFX9-NEXT: v_writelane_b32 v1, s6, 36 +; GFX9-NEXT: s_branch .LBB7_111 +; GFX9-NEXT: .LBB7_110: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_111: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 37 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX9-NEXT: ; %bb.112: +; GFX9-NEXT: v_writelane_b32 v1, s6, 37 +; GFX9-NEXT: s_branch .LBB7_114 +; GFX9-NEXT: .LBB7_113: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_114: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 38 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX9-NEXT: ; %bb.115: +; GFX9-NEXT: v_writelane_b32 v1, s6, 38 +; GFX9-NEXT: s_branch .LBB7_117 +; GFX9-NEXT: .LBB7_116: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_117: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX9-NEXT: 
s_bitcmp1_b32 exec_hi, 7 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 39 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX9-NEXT: ; %bb.118: +; GFX9-NEXT: v_writelane_b32 v1, s6, 39 +; GFX9-NEXT: s_branch .LBB7_120 +; GFX9-NEXT: .LBB7_119: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_120: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 40 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX9-NEXT: ; %bb.121: +; GFX9-NEXT: v_writelane_b32 v1, s6, 40 +; GFX9-NEXT: s_branch .LBB7_123 +; GFX9-NEXT: .LBB7_122: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_123: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 41 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX9-NEXT: ; %bb.124: +; GFX9-NEXT: v_writelane_b32 v1, s6, 41 +; GFX9-NEXT: s_branch .LBB7_126 +; GFX9-NEXT: .LBB7_125: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_126: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 42 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX9-NEXT: ; %bb.127: +; GFX9-NEXT: v_writelane_b32 v1, s6, 42 +; GFX9-NEXT: s_branch .LBB7_129 +; GFX9-NEXT: .LBB7_128: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: 
.LBB7_129: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 43 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX9-NEXT: ; %bb.130: +; GFX9-NEXT: v_writelane_b32 v1, s6, 43 +; GFX9-NEXT: s_branch .LBB7_132 +; GFX9-NEXT: .LBB7_131: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_132: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 44 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX9-NEXT: ; %bb.133: +; GFX9-NEXT: v_writelane_b32 v1, s6, 44 +; GFX9-NEXT: s_branch .LBB7_135 +; GFX9-NEXT: .LBB7_134: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_135: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 45 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX9-NEXT: ; %bb.136: +; GFX9-NEXT: v_writelane_b32 v1, s6, 45 +; GFX9-NEXT: s_branch .LBB7_138 +; GFX9-NEXT: .LBB7_137: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_138: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 46 +; GFX9-NEXT: s_cbranch_scc1 
.LBB7_140 +; GFX9-NEXT: ; %bb.139: +; GFX9-NEXT: v_writelane_b32 v1, s6, 46 +; GFX9-NEXT: s_branch .LBB7_141 +; GFX9-NEXT: .LBB7_140: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_141: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 47 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX9-NEXT: ; %bb.142: +; GFX9-NEXT: v_writelane_b32 v1, s6, 47 +; GFX9-NEXT: s_branch .LBB7_144 +; GFX9-NEXT: .LBB7_143: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_144: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 48 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX9-NEXT: ; %bb.145: +; GFX9-NEXT: v_writelane_b32 v1, s6, 48 +; GFX9-NEXT: s_branch .LBB7_147 +; GFX9-NEXT: .LBB7_146: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_147: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 49 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX9-NEXT: ; %bb.148: +; GFX9-NEXT: v_writelane_b32 v1, s6, 49 +; GFX9-NEXT: s_branch .LBB7_150 +; GFX9-NEXT: .LBB7_149: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_150: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x40000 
+; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 50 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX9-NEXT: ; %bb.151: +; GFX9-NEXT: v_writelane_b32 v1, s6, 50 +; GFX9-NEXT: s_branch .LBB7_153 +; GFX9-NEXT: .LBB7_152: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_153: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 51 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX9-NEXT: ; %bb.154: +; GFX9-NEXT: v_writelane_b32 v1, s6, 51 +; GFX9-NEXT: s_branch .LBB7_156 +; GFX9-NEXT: .LBB7_155: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_156: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 52 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX9-NEXT: ; %bb.157: +; GFX9-NEXT: v_writelane_b32 v1, s6, 52 +; GFX9-NEXT: s_branch .LBB7_159 +; GFX9-NEXT: .LBB7_158: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_159: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 53 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX9-NEXT: ; %bb.160: +; GFX9-NEXT: v_writelane_b32 v1, s6, 53 +; GFX9-NEXT: s_branch .LBB7_162 +; GFX9-NEXT: .LBB7_161: +; GFX9-NEXT: ; implicit-def: 
$vgpr1 +; GFX9-NEXT: .LBB7_162: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 54 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX9-NEXT: ; %bb.163: +; GFX9-NEXT: v_writelane_b32 v1, s6, 54 +; GFX9-NEXT: s_branch .LBB7_165 +; GFX9-NEXT: .LBB7_164: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_165: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 55 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX9-NEXT: ; %bb.166: +; GFX9-NEXT: v_writelane_b32 v1, s6, 55 +; GFX9-NEXT: s_branch .LBB7_168 +; GFX9-NEXT: .LBB7_167: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_168: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 56 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX9-NEXT: ; %bb.169: +; GFX9-NEXT: v_writelane_b32 v1, s6, 56 +; GFX9-NEXT: s_branch .LBB7_171 +; GFX9-NEXT: .LBB7_170: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_171: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 
57 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX9-NEXT: ; %bb.172: +; GFX9-NEXT: v_writelane_b32 v1, s6, 57 +; GFX9-NEXT: s_branch .LBB7_174 +; GFX9-NEXT: .LBB7_173: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_174: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 58 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX9-NEXT: ; %bb.175: +; GFX9-NEXT: v_writelane_b32 v1, s6, 58 +; GFX9-NEXT: s_branch .LBB7_177 +; GFX9-NEXT: .LBB7_176: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_177: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 59 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX9-NEXT: ; %bb.178: +; GFX9-NEXT: v_writelane_b32 v1, s6, 59 +; GFX9-NEXT: s_branch .LBB7_180 +; GFX9-NEXT: .LBB7_179: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_180: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 60 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX9-NEXT: ; %bb.181: +; GFX9-NEXT: v_writelane_b32 v1, s6, 60 +; GFX9-NEXT: s_branch .LBB7_183 +; GFX9-NEXT: .LBB7_182: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_183: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; 
GFX9-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 61 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX9-NEXT: ; %bb.184: +; GFX9-NEXT: v_writelane_b32 v1, s6, 61 +; GFX9-NEXT: s_branch .LBB7_186 +; GFX9-NEXT: .LBB7_185: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_186: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s3, v0, 62 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX9-NEXT: ; %bb.187: +; GFX9-NEXT: v_writelane_b32 v1, s6, 62 +; GFX9-NEXT: s_branch .LBB7_189 +; GFX9-NEXT: .LBB7_188: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_189: +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX9-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX9-NEXT: v_readlane_b32 s7, v0, 63 +; GFX9-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX9-NEXT: ; %bb.190: +; GFX9-NEXT: v_writelane_b32 v1, s6, 63 +; GFX9-NEXT: s_branch .LBB7_192 +; GFX9-NEXT: .LBB7_191: +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: .LBB7_192: +; GFX9-NEXT: v_readfirstlane_b32 s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz .LBB7_2 -; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_cbranch_execz .LBB7_194 +; GFX9-NEXT: ; %bb.193: ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], 
exec +; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_add_i32 s4, s6, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc -; GFX9-NEXT: .LBB7_2: +; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX9-NEXT: .LBB7_194: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v3, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry +; GFX10W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX10W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX10W64-NEXT: ; %bb.1: ; GFX10W64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: s_not_b64 exec, exec -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX10W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX10W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX10W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX10W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX10W64-NEXT: v_writelane_b32 v3, s5, 32 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX10W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX10W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX10W64-NEXT: ; implicit-def: $vgpr4 +; GFX10W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W64-NEXT: s_branch .LBB7_3 +; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_3: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX10W64-NEXT: ; %bb.4: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX10W64-NEXT: s_branch .LBB7_6 +; GFX10W64-NEXT: .LBB7_5: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_6: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; 
GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX10W64-NEXT: ; %bb.7: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX10W64-NEXT: s_branch .LBB7_9 +; GFX10W64-NEXT: .LBB7_8: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_9: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX10W64-NEXT: ; %bb.10: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX10W64-NEXT: s_branch .LBB7_12 +; GFX10W64-NEXT: .LBB7_11: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_12: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX10W64-NEXT: ; %bb.13: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX10W64-NEXT: s_branch .LBB7_15 +; GFX10W64-NEXT: .LBB7_14: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_15: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX10W64-NEXT: ; 
%bb.16: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX10W64-NEXT: s_branch .LBB7_18 +; GFX10W64-NEXT: .LBB7_17: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_18: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX10W64-NEXT: ; %bb.19: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX10W64-NEXT: s_branch .LBB7_21 +; GFX10W64-NEXT: .LBB7_20: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_21: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX10W64-NEXT: ; %bb.22: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX10W64-NEXT: s_branch .LBB7_24 +; GFX10W64-NEXT: .LBB7_23: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_24: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX10W64-NEXT: ; %bb.25: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX10W64-NEXT: s_branch .LBB7_27 +; GFX10W64-NEXT: .LBB7_26: +; GFX10W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_27: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX10W64-NEXT: ; %bb.28: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX10W64-NEXT: s_branch .LBB7_30 +; GFX10W64-NEXT: .LBB7_29: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_30: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX10W64-NEXT: ; %bb.31: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX10W64-NEXT: s_branch .LBB7_33 +; GFX10W64-NEXT: .LBB7_32: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_33: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX10W64-NEXT: ; %bb.34: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX10W64-NEXT: s_branch .LBB7_36 +; GFX10W64-NEXT: .LBB7_35: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_36: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: 
v_readlane_b32 s7, v0, 12 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX10W64-NEXT: ; %bb.37: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX10W64-NEXT: s_branch .LBB7_39 +; GFX10W64-NEXT: .LBB7_38: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_39: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX10W64-NEXT: ; %bb.40: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX10W64-NEXT: s_branch .LBB7_42 +; GFX10W64-NEXT: .LBB7_41: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_42: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX10W64-NEXT: ; %bb.43: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX10W64-NEXT: s_branch .LBB7_45 +; GFX10W64-NEXT: .LBB7_44: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_45: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 15 +; 
GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX10W64-NEXT: ; %bb.46: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX10W64-NEXT: s_branch .LBB7_48 +; GFX10W64-NEXT: .LBB7_47: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_48: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX10W64-NEXT: ; %bb.49: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX10W64-NEXT: s_branch .LBB7_51 +; GFX10W64-NEXT: .LBB7_50: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_51: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX10W64-NEXT: ; %bb.52: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX10W64-NEXT: s_branch .LBB7_54 +; GFX10W64-NEXT: .LBB7_53: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_54: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 
0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX10W64-NEXT: ; %bb.55: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX10W64-NEXT: s_branch .LBB7_57 +; GFX10W64-NEXT: .LBB7_56: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_57: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX10W64-NEXT: ; %bb.58: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX10W64-NEXT: s_branch .LBB7_60 +; GFX10W64-NEXT: .LBB7_59: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_60: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 20 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX10W64-NEXT: ; %bb.61: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX10W64-NEXT: s_branch .LBB7_63 +; GFX10W64-NEXT: .LBB7_62: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_63: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX10W64-NEXT: ; %bb.64: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX10W64-NEXT: s_branch .LBB7_66 +; 
GFX10W64-NEXT: .LBB7_65: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_66: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX10W64-NEXT: ; %bb.67: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX10W64-NEXT: s_branch .LBB7_69 +; GFX10W64-NEXT: .LBB7_68: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_69: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 23 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX10W64-NEXT: ; %bb.70: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX10W64-NEXT: s_branch .LBB7_72 +; GFX10W64-NEXT: .LBB7_71: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_72: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX10W64-NEXT: ; %bb.73: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX10W64-NEXT: s_branch .LBB7_75 +; GFX10W64-NEXT: .LBB7_74: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_75: +; GFX10W64-NEXT: s_and_b64 
s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX10W64-NEXT: ; %bb.76: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX10W64-NEXT: s_branch .LBB7_78 +; GFX10W64-NEXT: .LBB7_77: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_78: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX10W64-NEXT: ; %bb.79: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX10W64-NEXT: s_branch .LBB7_81 +; GFX10W64-NEXT: .LBB7_80: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_81: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX10W64-NEXT: ; %bb.82: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX10W64-NEXT: s_branch .LBB7_84 +; GFX10W64-NEXT: .LBB7_83: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_84: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX10W64-NEXT: s_add_i32 s6, s6, s2 +; 
GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s3, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX10W64-NEXT: ; %bb.85: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX10W64-NEXT: s_branch .LBB7_87 +; GFX10W64-NEXT: .LBB7_86: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_87: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX10W64-NEXT: ; %bb.88: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX10W64-NEXT: s_branch .LBB7_90 +; GFX10W64-NEXT: .LBB7_89: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_90: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX10W64-NEXT: s_add_i32 s4, s6, s2 +; GFX10W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX10W64-NEXT: s_mov_b32 s7, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX10W64-NEXT: ; %bb.91: +; GFX10W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX10W64-NEXT: s_branch .LBB7_93 +; GFX10W64-NEXT: .LBB7_92: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_93: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s2, s5, 0 +; GFX10W64-NEXT: s_add_i32 s6, s4, s2 +; GFX10W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX10W64-NEXT: s_cmp_gt_i32 
exec_lo, -1 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX10W64-NEXT: ; %bb.94: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX10W64-NEXT: s_branch .LBB7_96 +; GFX10W64-NEXT: .LBB7_95: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_96: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX10W64-NEXT: ; %bb.97: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX10W64-NEXT: s_branch .LBB7_99 +; GFX10W64-NEXT: .LBB7_98: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_99: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX10W64-NEXT: ; %bb.100: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX10W64-NEXT: s_branch .LBB7_102 +; GFX10W64-NEXT: .LBB7_101: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_102: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX10W64-NEXT: ; %bb.103: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 34 +; 
GFX10W64-NEXT: s_branch .LBB7_105 +; GFX10W64-NEXT: .LBB7_104: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_105: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX10W64-NEXT: ; %bb.106: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 35 +; GFX10W64-NEXT: s_branch .LBB7_108 +; GFX10W64-NEXT: .LBB7_107: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_108: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX10W64-NEXT: ; %bb.109: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX10W64-NEXT: s_branch .LBB7_111 +; GFX10W64-NEXT: .LBB7_110: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_111: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX10W64-NEXT: ; %bb.112: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX10W64-NEXT: s_branch .LBB7_114 +; GFX10W64-NEXT: .LBB7_113: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_114: +; GFX10W64-NEXT: s_and_b64 
s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX10W64-NEXT: ; %bb.115: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX10W64-NEXT: s_branch .LBB7_117 +; GFX10W64-NEXT: .LBB7_116: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_117: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX10W64-NEXT: ; %bb.118: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX10W64-NEXT: s_branch .LBB7_120 +; GFX10W64-NEXT: .LBB7_119: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_120: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX10W64-NEXT: ; %bb.121: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX10W64-NEXT: s_branch .LBB7_123 +; GFX10W64-NEXT: .LBB7_122: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_123: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; 
GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX10W64-NEXT: ; %bb.124: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX10W64-NEXT: s_branch .LBB7_126 +; GFX10W64-NEXT: .LBB7_125: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_126: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 42 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX10W64-NEXT: ; %bb.127: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX10W64-NEXT: s_branch .LBB7_129 +; GFX10W64-NEXT: .LBB7_128: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_129: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX10W64-NEXT: ; %bb.130: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX10W64-NEXT: s_branch .LBB7_132 +; GFX10W64-NEXT: .LBB7_131: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_132: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 
exec_hi, 12 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX10W64-NEXT: ; %bb.133: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX10W64-NEXT: s_branch .LBB7_135 +; GFX10W64-NEXT: .LBB7_134: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_135: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 13 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX10W64-NEXT: ; %bb.136: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX10W64-NEXT: s_branch .LBB7_138 +; GFX10W64-NEXT: .LBB7_137: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_138: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX10W64-NEXT: ; %bb.139: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX10W64-NEXT: s_branch .LBB7_141 +; GFX10W64-NEXT: .LBB7_140: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_141: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_143 +; 
GFX10W64-NEXT: ; %bb.142: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX10W64-NEXT: s_branch .LBB7_144 +; GFX10W64-NEXT: .LBB7_143: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_144: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX10W64-NEXT: ; %bb.145: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX10W64-NEXT: s_branch .LBB7_147 +; GFX10W64-NEXT: .LBB7_146: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_147: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX10W64-NEXT: ; %bb.148: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX10W64-NEXT: s_branch .LBB7_150 +; GFX10W64-NEXT: .LBB7_149: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_150: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX10W64-NEXT: ; %bb.151: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX10W64-NEXT: s_branch .LBB7_153 +; 
GFX10W64-NEXT: .LBB7_152: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_153: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX10W64-NEXT: ; %bb.154: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX10W64-NEXT: s_branch .LBB7_156 +; GFX10W64-NEXT: .LBB7_155: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_156: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX10W64-NEXT: ; %bb.157: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX10W64-NEXT: s_branch .LBB7_159 +; GFX10W64-NEXT: .LBB7_158: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_159: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX10W64-NEXT: ; %bb.160: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX10W64-NEXT: s_branch .LBB7_162 +; GFX10W64-NEXT: .LBB7_161: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_162: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], 
exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX10W64-NEXT: ; %bb.163: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX10W64-NEXT: s_branch .LBB7_165 +; GFX10W64-NEXT: .LBB7_164: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_165: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX10W64-NEXT: ; %bb.166: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX10W64-NEXT: s_branch .LBB7_168 +; GFX10W64-NEXT: .LBB7_167: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_168: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX10W64-NEXT: ; %bb.169: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX10W64-NEXT: s_branch .LBB7_171 +; GFX10W64-NEXT: .LBB7_170: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_171: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; 
GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX10W64-NEXT: ; %bb.172: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX10W64-NEXT: s_branch .LBB7_174 +; GFX10W64-NEXT: .LBB7_173: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_174: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 58 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX10W64-NEXT: ; %bb.175: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX10W64-NEXT: s_branch .LBB7_177 +; GFX10W64-NEXT: .LBB7_176: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_177: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX10W64-NEXT: ; %bb.178: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX10W64-NEXT: s_branch .LBB7_180 +; GFX10W64-NEXT: .LBB7_179: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_180: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: 
s_bitcmp1_b32 exec_hi, 28 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX10W64-NEXT: ; %bb.181: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX10W64-NEXT: s_branch .LBB7_183 +; GFX10W64-NEXT: .LBB7_182: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_183: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX10W64-NEXT: s_add_i32 s6, s6, s4 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 61 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX10W64-NEXT: ; %bb.184: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX10W64-NEXT: s_branch .LBB7_186 +; GFX10W64-NEXT: .LBB7_185: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_186: +; GFX10W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX10W64-NEXT: s_mov_b32 s2, 0 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX10W64-NEXT: ; %bb.187: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX10W64-NEXT: s_branch .LBB7_189 +; GFX10W64-NEXT: .LBB7_188: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_189: +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX10W64-NEXT: s_add_i32 s6, s6, s3 +; GFX10W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX10W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX10W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; 
GFX10W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX10W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX10W64-NEXT: ; %bb.190: +; GFX10W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX10W64-NEXT: s_branch .LBB7_192 +; GFX10W64-NEXT: .LBB7_191: +; GFX10W64-NEXT: ; implicit-def: $vgpr1 +; GFX10W64-NEXT: .LBB7_192: +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10W64-NEXT: ; implicit-def: $vgpr0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX10W64-NEXT: s_cbranch_execz .LBB7_2 -; GFX10W64-NEXT: ; %bb.1: +; GFX10W64-NEXT: s_cbranch_execz .LBB7_194 +; GFX10W64-NEXT: ; %bb.193: ; GFX10W64-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 -; GFX10W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX10W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX10W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX10W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX10W64-NEXT: s_add_i32 s4, s6, s4 +; GFX10W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: buffer_atomic_sub v4, v0, s[8:11], 0 idxen glc -; GFX10W64-NEXT: .LBB7_2: +; GFX10W64-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX10W64-NEXT: .LBB7_194: ; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W64-NEXT: s_waitcnt vmcnt(0) -; GFX10W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W64-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W64-NEXT: s_endpgm ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: 
s_not_b32 exec_lo, exec_lo -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX10W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX10W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX10W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX10W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX10W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX10W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX10W32-NEXT: ; implicit-def: $vgpr4 -; GFX10W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10W32-NEXT: s_cbranch_execz .LBB7_2 +; GFX10W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX10W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX10W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_2 ; GFX10W32-NEXT: ; %bb.1: -; GFX10W32-NEXT: s_mov_b32 s3, s4 -; GFX10W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 -; GFX10W32-NEXT: v_mov_b32_e32 v4, s3 -; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: buffer_atomic_sub v4, v0, s[4:7], 0 idxen glc +; GFX10W32-NEXT: v_mov_b32_e32 v1, v0 +; GFX10W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX10W32-NEXT: s_branch .LBB7_3 ; GFX10W32-NEXT: .LBB7_2: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_3: +; 
GFX10W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX10W32-NEXT: ; %bb.4: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX10W32-NEXT: s_branch .LBB7_6 +; GFX10W32-NEXT: .LBB7_5: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_6: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX10W32-NEXT: ; %bb.7: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX10W32-NEXT: s_branch .LBB7_9 +; GFX10W32-NEXT: .LBB7_8: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_9: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX10W32-NEXT: ; %bb.10: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX10W32-NEXT: s_branch .LBB7_12 +; GFX10W32-NEXT: .LBB7_11: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_12: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX10W32-NEXT: ; %bb.13: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX10W32-NEXT: s_branch .LBB7_15 +; GFX10W32-NEXT: .LBB7_14: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_15: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX10W32-NEXT: ; %bb.16: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX10W32-NEXT: s_branch .LBB7_18 +; GFX10W32-NEXT: .LBB7_17: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_18: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX10W32-NEXT: ; %bb.19: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX10W32-NEXT: s_branch .LBB7_21 +; GFX10W32-NEXT: .LBB7_20: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_21: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX10W32-NEXT: ; %bb.22: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX10W32-NEXT: s_branch .LBB7_24 +; GFX10W32-NEXT: .LBB7_23: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 
+; GFX10W32-NEXT: .LBB7_24: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX10W32-NEXT: ; %bb.25: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX10W32-NEXT: s_branch .LBB7_27 +; GFX10W32-NEXT: .LBB7_26: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_27: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX10W32-NEXT: ; %bb.28: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX10W32-NEXT: s_branch .LBB7_30 +; GFX10W32-NEXT: .LBB7_29: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_30: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX10W32-NEXT: ; %bb.31: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX10W32-NEXT: s_branch .LBB7_33 +; GFX10W32-NEXT: .LBB7_32: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_33: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 11 
+; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX10W32-NEXT: ; %bb.34: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX10W32-NEXT: s_branch .LBB7_36 +; GFX10W32-NEXT: .LBB7_35: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_36: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX10W32-NEXT: ; %bb.37: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX10W32-NEXT: s_branch .LBB7_39 +; GFX10W32-NEXT: .LBB7_38: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_39: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX10W32-NEXT: ; %bb.40: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX10W32-NEXT: s_branch .LBB7_42 +; GFX10W32-NEXT: .LBB7_41: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_42: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX10W32-NEXT: ; %bb.43: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 
14 +; GFX10W32-NEXT: s_branch .LBB7_45 +; GFX10W32-NEXT: .LBB7_44: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_45: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX10W32-NEXT: ; %bb.46: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX10W32-NEXT: s_branch .LBB7_48 +; GFX10W32-NEXT: .LBB7_47: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_48: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX10W32-NEXT: ; %bb.49: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX10W32-NEXT: s_branch .LBB7_51 +; GFX10W32-NEXT: .LBB7_50: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_51: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX10W32-NEXT: ; %bb.52: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX10W32-NEXT: s_branch .LBB7_54 +; GFX10W32-NEXT: .LBB7_53: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_54: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; 
GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX10W32-NEXT: ; %bb.55: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX10W32-NEXT: s_branch .LBB7_57 +; GFX10W32-NEXT: .LBB7_56: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_57: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX10W32-NEXT: ; %bb.58: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX10W32-NEXT: s_branch .LBB7_60 +; GFX10W32-NEXT: .LBB7_59: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_60: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX10W32-NEXT: ; %bb.61: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX10W32-NEXT: s_branch .LBB7_63 +; GFX10W32-NEXT: .LBB7_62: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_63: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX10W32-NEXT: ; %bb.64: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX10W32-NEXT: s_branch .LBB7_66 +; GFX10W32-NEXT: .LBB7_65: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_66: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX10W32-NEXT: ; %bb.67: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX10W32-NEXT: s_branch .LBB7_69 +; GFX10W32-NEXT: .LBB7_68: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_69: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX10W32-NEXT: ; %bb.70: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX10W32-NEXT: s_branch .LBB7_72 +; GFX10W32-NEXT: .LBB7_71: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_72: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX10W32-NEXT: ; %bb.73: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX10W32-NEXT: s_branch .LBB7_75 +; GFX10W32-NEXT: .LBB7_74: +; 
GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_75: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX10W32-NEXT: ; %bb.76: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX10W32-NEXT: s_branch .LBB7_78 +; GFX10W32-NEXT: .LBB7_77: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_78: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX10W32-NEXT: ; %bb.79: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX10W32-NEXT: s_branch .LBB7_81 +; GFX10W32-NEXT: .LBB7_80: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_81: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX10W32-NEXT: ; %bb.82: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX10W32-NEXT: s_branch .LBB7_84 +; GFX10W32-NEXT: .LBB7_83: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_84: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX10W32-NEXT: 
s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX10W32-NEXT: ; %bb.85: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX10W32-NEXT: s_branch .LBB7_87 +; GFX10W32-NEXT: .LBB7_86: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_87: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX10W32-NEXT: ; %bb.88: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX10W32-NEXT: s_branch .LBB7_90 +; GFX10W32-NEXT: .LBB7_89: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_90: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX10W32-NEXT: ; %bb.91: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX10W32-NEXT: s_branch .LBB7_93 +; GFX10W32-NEXT: .LBB7_92: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_93: +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX10W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX10W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX10W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX10W32-NEXT: 
s_cmp_eq_u32 s5, 0 +; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX10W32-NEXT: ; %bb.94: +; GFX10W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX10W32-NEXT: s_branch .LBB7_96 +; GFX10W32-NEXT: .LBB7_95: +; GFX10W32-NEXT: ; implicit-def: $vgpr1 +; GFX10W32-NEXT: .LBB7_96: +; GFX10W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX10W32-NEXT: ; implicit-def: $vgpr0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX10W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX10W32-NEXT: s_cbranch_execz .LBB7_98 +; GFX10W32-NEXT: ; %bb.97: +; GFX10W32-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 +; GFX10W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX10W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX10W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX10W32-NEXT: s_add_i32 s2, s2, s3 +; GFX10W32-NEXT: v_mov_b32_e32 v0, s2 +; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX10W32-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc +; GFX10W32-NEXT: .LBB7_98: ; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10W32-NEXT: s_waitcnt vmcnt(0) -; GFX10W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX10W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX10W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10W32-NEXT: global_store_dword v0, v4, s[0:1] +; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10W32-NEXT: s_endpgm ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry +; GFX11W64-NEXT: s_and_b32 s3, exec_lo, 1 +; GFX11W64-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX11W64-NEXT: ; %bb.1: ; GFX11W64-NEXT: v_mov_b32_e32 v1, v0 -; 
GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W64-NEXT: s_not_b64 exec, exec -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX11W64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11W64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 15 -; GFX11W64-NEXT: v_readlane_b32 s5, v1, 31 -; GFX11W64-NEXT: v_writelane_b32 v3, s4, 16 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_readlane_b32 s4, v1, 63 -; GFX11W64-NEXT: v_readlane_b32 s6, v1, 47 -; GFX11W64-NEXT: 
v_writelane_b32 v3, s5, 32 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] +; GFX11W64-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W64-NEXT: s_branch .LBB7_3 +; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_3: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s6, s2, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 2 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 1 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX11W64-NEXT: ; %bb.4: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 1 +; GFX11W64-NEXT: s_branch .LBB7_6 +; GFX11W64-NEXT: .LBB7_5: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_6: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 4 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX11W64-NEXT: ; %bb.7: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 2 +; GFX11W64-NEXT: s_branch .LBB7_9 +; GFX11W64-NEXT: .LBB7_8: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_9: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 3 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX11W64-NEXT: ; %bb.10: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 3 +; GFX11W64-NEXT: s_branch .LBB7_12 +; GFX11W64-NEXT: .LBB7_11: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB7_12: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 4 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 16 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX11W64-NEXT: ; %bb.13: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 4 +; GFX11W64-NEXT: s_branch .LBB7_15 +; GFX11W64-NEXT: .LBB7_14: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_15: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 5 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX11W64-NEXT: ; %bb.16: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 5 +; GFX11W64-NEXT: s_branch .LBB7_18 +; GFX11W64-NEXT: .LBB7_17: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_18: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 6 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 64 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX11W64-NEXT: ; %bb.19: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 6 +; GFX11W64-NEXT: s_branch .LBB7_21 +; GFX11W64-NEXT: .LBB7_20: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_21: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 7 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX11W64-NEXT: ; %bb.22: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 7 +; GFX11W64-NEXT: s_branch .LBB7_24 +; GFX11W64-NEXT: .LBB7_23: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_24: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX11W64-NEXT: ; %bb.25: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 8 +; GFX11W64-NEXT: s_branch .LBB7_27 +; GFX11W64-NEXT: .LBB7_26: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_27: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 9 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX11W64-NEXT: ; %bb.28: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 9 +; GFX11W64-NEXT: s_branch .LBB7_30 +; GFX11W64-NEXT: .LBB7_29: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_30: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 10 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 10 +; 
GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX11W64-NEXT: ; %bb.31: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 10 +; GFX11W64-NEXT: s_branch .LBB7_33 +; GFX11W64-NEXT: .LBB7_32: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_33: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 11 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX11W64-NEXT: ; %bb.34: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 11 +; GFX11W64-NEXT: s_branch .LBB7_36 +; GFX11W64-NEXT: .LBB7_35: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_36: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 12 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX11W64-NEXT: ; %bb.37: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 12 +; GFX11W64-NEXT: s_branch .LBB7_39 +; GFX11W64-NEXT: .LBB7_38: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_39: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 13 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX11W64-NEXT: ; %bb.40: +; 
GFX11W64-NEXT: v_writelane_b32 v1, s6, 13 +; GFX11W64-NEXT: s_branch .LBB7_42 +; GFX11W64-NEXT: .LBB7_41: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_42: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 14 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX11W64-NEXT: ; %bb.43: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 14 +; GFX11W64-NEXT: s_branch .LBB7_45 +; GFX11W64-NEXT: .LBB7_44: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_45: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 15 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX11W64-NEXT: ; %bb.46: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 15 +; GFX11W64-NEXT: s_branch .LBB7_48 +; GFX11W64-NEXT: .LBB7_47: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_48: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 16 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX11W64-NEXT: ; %bb.49: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 16 +; GFX11W64-NEXT: s_branch .LBB7_51 +; GFX11W64-NEXT: .LBB7_50: +; GFX11W64-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_51: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 17 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX11W64-NEXT: ; %bb.52: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 17 +; GFX11W64-NEXT: s_branch .LBB7_54 +; GFX11W64-NEXT: .LBB7_53: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_54: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 18 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX11W64-NEXT: ; %bb.55: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 18 +; GFX11W64-NEXT: s_branch .LBB7_57 +; GFX11W64-NEXT: .LBB7_56: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_57: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 19 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX11W64-NEXT: ; %bb.58: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 19 +; GFX11W64-NEXT: s_branch .LBB7_60 +; GFX11W64-NEXT: .LBB7_59: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_60: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: 
v_readlane_b32 s7, v0, 20 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX11W64-NEXT: ; %bb.61: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 20 +; GFX11W64-NEXT: s_branch .LBB7_63 +; GFX11W64-NEXT: .LBB7_62: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_63: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 21 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX11W64-NEXT: ; %bb.64: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 21 +; GFX11W64-NEXT: s_branch .LBB7_66 +; GFX11W64-NEXT: .LBB7_65: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_66: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 22 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX11W64-NEXT: ; %bb.67: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 22 +; GFX11W64-NEXT: s_branch .LBB7_69 +; GFX11W64-NEXT: .LBB7_68: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_69: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 23 +; 
GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX11W64-NEXT: ; %bb.70: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 23 +; GFX11W64-NEXT: s_branch .LBB7_72 +; GFX11W64-NEXT: .LBB7_71: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_72: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 24 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX11W64-NEXT: ; %bb.73: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 24 +; GFX11W64-NEXT: s_branch .LBB7_75 +; GFX11W64-NEXT: .LBB7_74: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_75: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 25 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX11W64-NEXT: ; %bb.76: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 25 +; GFX11W64-NEXT: s_branch .LBB7_78 +; GFX11W64-NEXT: .LBB7_77: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_78: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 26 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 
s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX11W64-NEXT: ; %bb.79: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 26 +; GFX11W64-NEXT: s_branch .LBB7_81 +; GFX11W64-NEXT: .LBB7_80: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_81: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 27 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX11W64-NEXT: ; %bb.82: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 27 +; GFX11W64-NEXT: s_branch .LBB7_84 +; GFX11W64-NEXT: .LBB7_83: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_84: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 28 +; GFX11W64-NEXT: s_add_i32 s6, s6, s2 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s3, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX11W64-NEXT: ; %bb.85: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 28 +; GFX11W64-NEXT: s_branch .LBB7_87 +; GFX11W64-NEXT: .LBB7_86: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_87: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s2, exec_lo, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 29 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX11W64-NEXT: ; %bb.88: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 29 +; GFX11W64-NEXT: s_branch 
.LBB7_90 +; GFX11W64-NEXT: .LBB7_89: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_90: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s5, v0, 30 +; GFX11W64-NEXT: s_add_i32 s4, s6, s2 +; GFX11W64-NEXT: s_and_b32 s6, exec_lo, 2.0 +; GFX11W64-NEXT: s_mov_b32 s7, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX11W64-NEXT: ; %bb.91: +; GFX11W64-NEXT: v_writelane_b32 v1, s4, 30 +; GFX11W64-NEXT: s_branch .LBB7_93 +; GFX11W64-NEXT: .LBB7_92: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_93: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s2, s5, 0 ; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 -; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX11W64-NEXT: v_writelane_b32 v3, s6, 48 -; GFX11W64-NEXT: s_mov_b64 exec, s[2:3] -; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX11W64-NEXT: ; implicit-def: $vgpr4 +; GFX11W64-NEXT: s_add_i32 s6, s4, s2 +; GFX11W64-NEXT: v_readlane_b32 s4, v0, 31 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W64-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX11W64-NEXT: s_cmp_gt_i32 exec_lo, -1 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX11W64-NEXT: ; %bb.94: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 31 +; GFX11W64-NEXT: s_branch .LBB7_96 +; GFX11W64-NEXT: .LBB7_95: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_96: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s4, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 1 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 0 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; 
GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_98 +; GFX11W64-NEXT: ; %bb.97: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 32 +; GFX11W64-NEXT: s_branch .LBB7_99 +; GFX11W64-NEXT: .LBB7_98: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_99: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 33 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 1 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_101 +; GFX11W64-NEXT: ; %bb.100: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 33 +; GFX11W64-NEXT: s_branch .LBB7_102 +; GFX11W64-NEXT: .LBB7_101: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_102: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 34 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 4 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 2 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_104 +; GFX11W64-NEXT: ; %bb.103: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 34 +; GFX11W64-NEXT: s_branch .LBB7_105 +; GFX11W64-NEXT: .LBB7_104: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_105: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 8 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 35 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 3 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_107 +; GFX11W64-NEXT: ; %bb.106: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 35 +; 
GFX11W64-NEXT: s_branch .LBB7_108 +; GFX11W64-NEXT: .LBB7_107: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_108: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 36 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 16 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 4 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_110 +; GFX11W64-NEXT: ; %bb.109: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 36 +; GFX11W64-NEXT: s_branch .LBB7_111 +; GFX11W64-NEXT: .LBB7_110: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_111: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 32 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 37 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 5 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_113 +; GFX11W64-NEXT: ; %bb.112: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 37 +; GFX11W64-NEXT: s_branch .LBB7_114 +; GFX11W64-NEXT: .LBB7_113: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_114: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 38 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 64 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 6 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_116 +; GFX11W64-NEXT: ; %bb.115: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 38 +; GFX11W64-NEXT: s_branch .LBB7_117 +; GFX11W64-NEXT: .LBB7_116: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB7_117: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 39 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 7 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_119 +; GFX11W64-NEXT: ; %bb.118: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 39 +; GFX11W64-NEXT: s_branch .LBB7_120 +; GFX11W64-NEXT: .LBB7_119: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_120: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 40 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 8 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_122 +; GFX11W64-NEXT: ; %bb.121: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 40 +; GFX11W64-NEXT: s_branch .LBB7_123 +; GFX11W64-NEXT: .LBB7_122: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_123: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 41 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 9 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_125 +; GFX11W64-NEXT: ; %bb.124: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 41 +; GFX11W64-NEXT: s_branch .LBB7_126 +; GFX11W64-NEXT: .LBB7_125: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_126: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 42 +; 
GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 10 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_128 +; GFX11W64-NEXT: ; %bb.127: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 42 +; GFX11W64-NEXT: s_branch .LBB7_129 +; GFX11W64-NEXT: .LBB7_128: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_129: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 43 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 11 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_131 +; GFX11W64-NEXT: ; %bb.130: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 43 +; GFX11W64-NEXT: s_branch .LBB7_132 +; GFX11W64-NEXT: .LBB7_131: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_132: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 44 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 12 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_134 +; GFX11W64-NEXT: ; %bb.133: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 44 +; GFX11W64-NEXT: s_branch .LBB7_135 +; GFX11W64-NEXT: .LBB7_134: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_135: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 45 +; GFX11W64-NEXT: s_bitcmp1_b32 
exec_hi, 13 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_137 +; GFX11W64-NEXT: ; %bb.136: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 45 +; GFX11W64-NEXT: s_branch .LBB7_138 +; GFX11W64-NEXT: .LBB7_137: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_138: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 46 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 14 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_140 +; GFX11W64-NEXT: ; %bb.139: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 46 +; GFX11W64-NEXT: s_branch .LBB7_141 +; GFX11W64-NEXT: .LBB7_140: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_141: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 47 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 15 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_143 +; GFX11W64-NEXT: ; %bb.142: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 47 +; GFX11W64-NEXT: s_branch .LBB7_144 +; GFX11W64-NEXT: .LBB7_143: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_144: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 48 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 16 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; 
GFX11W64-NEXT: s_cbranch_scc1 .LBB7_146 +; GFX11W64-NEXT: ; %bb.145: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 48 +; GFX11W64-NEXT: s_branch .LBB7_147 +; GFX11W64-NEXT: .LBB7_146: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_147: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 49 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 17 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_149 +; GFX11W64-NEXT: ; %bb.148: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 49 +; GFX11W64-NEXT: s_branch .LBB7_150 +; GFX11W64-NEXT: .LBB7_149: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_150: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 50 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x40000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 18 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_152 +; GFX11W64-NEXT: ; %bb.151: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 50 +; GFX11W64-NEXT: s_branch .LBB7_153 +; GFX11W64-NEXT: .LBB7_152: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_153: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 51 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 19 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_155 +; GFX11W64-NEXT: ; %bb.154: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 51 +; GFX11W64-NEXT: s_branch 
.LBB7_156 +; GFX11W64-NEXT: .LBB7_155: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_156: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 52 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x100000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 20 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_158 +; GFX11W64-NEXT: ; %bb.157: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 52 +; GFX11W64-NEXT: s_branch .LBB7_159 +; GFX11W64-NEXT: .LBB7_158: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_159: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x200000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 53 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 21 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_161 +; GFX11W64-NEXT: ; %bb.160: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 53 +; GFX11W64-NEXT: s_branch .LBB7_162 +; GFX11W64-NEXT: .LBB7_161: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_162: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 54 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x400000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 22 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_164 +; GFX11W64-NEXT: ; %bb.163: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 54 +; GFX11W64-NEXT: s_branch .LBB7_165 +; GFX11W64-NEXT: .LBB7_164: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: 
.LBB7_165: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x800000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 55 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 23 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_167 +; GFX11W64-NEXT: ; %bb.166: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 55 +; GFX11W64-NEXT: s_branch .LBB7_168 +; GFX11W64-NEXT: .LBB7_167: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_168: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 56 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x1000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 24 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_170 +; GFX11W64-NEXT: ; %bb.169: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 56 +; GFX11W64-NEXT: s_branch .LBB7_171 +; GFX11W64-NEXT: .LBB7_170: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_171: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x2000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 57 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 25 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_173 +; GFX11W64-NEXT: ; %bb.172: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 57 +; GFX11W64-NEXT: s_branch .LBB7_174 +; GFX11W64-NEXT: .LBB7_173: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_174: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, 
v0, 58 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x4000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 26 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_176 +; GFX11W64-NEXT: ; %bb.175: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 58 +; GFX11W64-NEXT: s_branch .LBB7_177 +; GFX11W64-NEXT: .LBB7_176: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_177: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x8000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 59 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 27 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_179 +; GFX11W64-NEXT: ; %bb.178: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 59 +; GFX11W64-NEXT: s_branch .LBB7_180 +; GFX11W64-NEXT: .LBB7_179: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_180: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 60 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x10000000 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 28 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_182 +; GFX11W64-NEXT: ; %bb.181: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 60 +; GFX11W64-NEXT: s_branch .LBB7_183 +; GFX11W64-NEXT: .LBB7_182: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_183: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x20000000 +; GFX11W64-NEXT: s_add_i32 s6, s6, s4 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 61 +; 
GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 29 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_185 +; GFX11W64-NEXT: ; %bb.184: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 61 +; GFX11W64-NEXT: s_branch .LBB7_186 +; GFX11W64-NEXT: .LBB7_185: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_186: +; GFX11W64-NEXT: s_and_b64 s[2:3], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 2.0 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 62 +; GFX11W64-NEXT: s_mov_b32 s2, 0 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 30 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_188 +; GFX11W64-NEXT: ; %bb.187: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 62 +; GFX11W64-NEXT: s_branch .LBB7_189 +; GFX11W64-NEXT: .LBB7_188: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_189: +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s3, s7, 0 +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX11W64-NEXT: s_add_i32 s6, s6, s3 +; GFX11W64-NEXT: s_and_b32 s3, exec_hi, 0x80000000 +; GFX11W64-NEXT: v_readlane_b32 s7, v0, 63 +; GFX11W64-NEXT: s_bitcmp1_b32 exec_hi, 31 +; GFX11W64-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11W64-NEXT: s_cmp_eq_u64 s[2:3], 0 +; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_191 +; GFX11W64-NEXT: ; %bb.190: +; GFX11W64-NEXT: v_writelane_b32 v1, s6, 63 +; GFX11W64-NEXT: s_branch .LBB7_192 +; GFX11W64-NEXT: .LBB7_191: +; GFX11W64-NEXT: ; implicit-def: $vgpr1 +; GFX11W64-NEXT: .LBB7_192: +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11W64-NEXT: ; implicit-def: $vgpr0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX11W64-NEXT: s_cbranch_execz 
.LBB7_2 -; GFX11W64-NEXT: ; %bb.1: +; GFX11W64-NEXT: s_cbranch_execz .LBB7_194 +; GFX11W64-NEXT: ; %bb.193: ; GFX11W64-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 -; GFX11W64-NEXT: v_mov_b32_e32 v4, s4 +; GFX11W64-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX11W64-NEXT: s_cselect_b32 s4, s7, 0 +; GFX11W64-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W64-NEXT: s_add_i32 s4, s6, s4 +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11W64-NEXT: v_mov_b32_e32 v0, s4 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: buffer_atomic_sub_u32 v4, v0, s[8:11], 0 idxen glc -; GFX11W64-NEXT: .LBB7_2: +; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W64-NEXT: .LBB7_194: ; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W64-NEXT: s_waitcnt vmcnt(0) -; GFX11W64-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W64-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W64-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W64-NEXT: s_endpgm ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry +; GFX11W32-NEXT: s_and_b32 s4, exec_lo, 1 +; GFX11W32-NEXT: v_readlane_b32 s2, v0, 0 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 0 +; GFX11W32-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_2 +; GFX11W32-NEXT: ; %bb.1: ; GFX11W32-NEXT: v_mov_b32_e32 v1, v0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX11W32-NEXT: s_not_b32 exec_lo, exec_lo -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_mov_b32_e32 v3, 0 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_mov_b32_e32 v2, v1 -; GFX11W32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11W32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s4, v1, 31 -; GFX11W32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX11W32-NEXT: v_readlane_b32 s3, v1, 15 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 +; GFX11W32-NEXT: v_writelane_b32 v1, 0, 0 +; GFX11W32-NEXT: s_branch .LBB7_3 +; GFX11W32-NEXT: .LBB7_2: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_3: +; GFX11W32-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s2, s2, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 1 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 1 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_5 +; GFX11W32-NEXT: ; %bb.4: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 1 +; GFX11W32-NEXT: s_branch .LBB7_6 +; GFX11W32-NEXT: .LBB7_5: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_6: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 4 +; GFX11W32-NEXT: 
s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 2 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 2 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_8 +; GFX11W32-NEXT: ; %bb.7: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 2 +; GFX11W32-NEXT: s_branch .LBB7_9 +; GFX11W32-NEXT: .LBB7_8: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_9: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 8 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 3 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 3 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_11 +; GFX11W32-NEXT: ; %bb.10: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 3 +; GFX11W32-NEXT: s_branch .LBB7_12 +; GFX11W32-NEXT: .LBB7_11: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_12: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 16 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 4 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 4 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_14 +; GFX11W32-NEXT: ; %bb.13: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 4 +; GFX11W32-NEXT: s_branch .LBB7_15 +; GFX11W32-NEXT: .LBB7_14: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_15: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 32 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 5 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 5 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_17 +; GFX11W32-NEXT: ; %bb.16: +; 
GFX11W32-NEXT: v_writelane_b32 v1, s2, 5 +; GFX11W32-NEXT: s_branch .LBB7_18 +; GFX11W32-NEXT: .LBB7_17: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_18: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 64 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 6 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 6 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_20 +; GFX11W32-NEXT: ; %bb.19: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 6 +; GFX11W32-NEXT: s_branch .LBB7_21 +; GFX11W32-NEXT: .LBB7_20: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_21: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 7 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 7 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_23 +; GFX11W32-NEXT: ; %bb.22: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 7 +; GFX11W32-NEXT: s_branch .LBB7_24 +; GFX11W32-NEXT: .LBB7_23: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_24: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 8 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 8 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_26 +; GFX11W32-NEXT: ; %bb.25: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 8 +; GFX11W32-NEXT: s_branch .LBB7_27 +; GFX11W32-NEXT: .LBB7_26: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_27: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 
s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 9 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 9 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_29 +; GFX11W32-NEXT: ; %bb.28: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 9 +; GFX11W32-NEXT: s_branch .LBB7_30 +; GFX11W32-NEXT: .LBB7_29: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_30: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 10 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 10 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_32 +; GFX11W32-NEXT: ; %bb.31: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 10 +; GFX11W32-NEXT: s_branch .LBB7_33 +; GFX11W32-NEXT: .LBB7_32: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_33: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 11 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 11 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_35 +; GFX11W32-NEXT: ; %bb.34: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 11 +; GFX11W32-NEXT: s_branch .LBB7_36 +; GFX11W32-NEXT: .LBB7_35: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_36: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 12 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 12 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; 
GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_38 +; GFX11W32-NEXT: ; %bb.37: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 12 +; GFX11W32-NEXT: s_branch .LBB7_39 +; GFX11W32-NEXT: .LBB7_38: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_39: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 13 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 13 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_41 +; GFX11W32-NEXT: ; %bb.40: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 13 +; GFX11W32-NEXT: s_branch .LBB7_42 +; GFX11W32-NEXT: .LBB7_41: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_42: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 14 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 14 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_44 +; GFX11W32-NEXT: ; %bb.43: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 14 +; GFX11W32-NEXT: s_branch .LBB7_45 +; GFX11W32-NEXT: .LBB7_44: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_45: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 15 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 15 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_47 +; GFX11W32-NEXT: ; %bb.46: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 15 +; GFX11W32-NEXT: s_branch .LBB7_48 +; GFX11W32-NEXT: .LBB7_47: +; GFX11W32-NEXT: ; 
implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_48: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 16 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 16 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_50 +; GFX11W32-NEXT: ; %bb.49: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 16 +; GFX11W32-NEXT: s_branch .LBB7_51 +; GFX11W32-NEXT: .LBB7_50: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_51: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 17 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 17 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_53 +; GFX11W32-NEXT: ; %bb.52: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 17 +; GFX11W32-NEXT: s_branch .LBB7_54 +; GFX11W32-NEXT: .LBB7_53: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_54: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x40000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 18 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 18 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_56 +; GFX11W32-NEXT: ; %bb.55: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 18 +; GFX11W32-NEXT: s_branch .LBB7_57 +; GFX11W32-NEXT: .LBB7_56: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_57: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; 
GFX11W32-NEXT: v_readlane_b32 s3, v0, 19 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 19 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_59 +; GFX11W32-NEXT: ; %bb.58: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 19 +; GFX11W32-NEXT: s_branch .LBB7_60 +; GFX11W32-NEXT: .LBB7_59: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_60: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x100000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 20 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 20 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_62 +; GFX11W32-NEXT: ; %bb.61: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 20 +; GFX11W32-NEXT: s_branch .LBB7_63 +; GFX11W32-NEXT: .LBB7_62: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_63: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x200000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 21 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 21 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_65 +; GFX11W32-NEXT: ; %bb.64: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 21 +; GFX11W32-NEXT: s_branch .LBB7_66 +; GFX11W32-NEXT: .LBB7_65: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_66: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x400000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 22 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 22 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_68 +; GFX11W32-NEXT: ; 
%bb.67: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 22 +; GFX11W32-NEXT: s_branch .LBB7_69 +; GFX11W32-NEXT: .LBB7_68: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_69: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x800000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 23 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 23 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_71 +; GFX11W32-NEXT: ; %bb.70: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 23 +; GFX11W32-NEXT: s_branch .LBB7_72 +; GFX11W32-NEXT: .LBB7_71: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_72: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x1000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 24 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 24 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_74 +; GFX11W32-NEXT: ; %bb.73: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 24 +; GFX11W32-NEXT: s_branch .LBB7_75 +; GFX11W32-NEXT: .LBB7_74: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_75: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x2000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 25 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 25 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_77 +; GFX11W32-NEXT: ; %bb.76: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 25 +; GFX11W32-NEXT: s_branch .LBB7_78 +; GFX11W32-NEXT: .LBB7_77: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_78: +; GFX11W32-NEXT: s_and_b32 s4, s4, 
exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x4000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 26 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 26 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_80 +; GFX11W32-NEXT: ; %bb.79: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 26 +; GFX11W32-NEXT: s_branch .LBB7_81 +; GFX11W32-NEXT: .LBB7_80: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_81: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x8000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 27 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 27 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_83 +; GFX11W32-NEXT: ; %bb.82: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 27 +; GFX11W32-NEXT: s_branch .LBB7_84 +; GFX11W32-NEXT: .LBB7_83: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_84: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x10000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 28 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 28 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_86 +; GFX11W32-NEXT: ; %bb.85: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 28 +; GFX11W32-NEXT: s_branch .LBB7_87 +; GFX11W32-NEXT: .LBB7_86: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_87: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x20000000 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 29 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 
29 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_89 +; GFX11W32-NEXT: ; %bb.88: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 29 +; GFX11W32-NEXT: s_branch .LBB7_90 +; GFX11W32-NEXT: .LBB7_89: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_90: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 2.0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 30 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 30 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_92 +; GFX11W32-NEXT: ; %bb.91: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 30 +; GFX11W32-NEXT: s_branch .LBB7_93 +; GFX11W32-NEXT: .LBB7_92: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_93: +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 0 +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 +; GFX11W32-NEXT: s_and_b32 s5, exec_lo, 0x80000000 +; GFX11W32-NEXT: v_readlane_b32 s3, v0, 31 +; GFX11W32-NEXT: s_bitcmp1_b32 exec_lo, 31 +; GFX11W32-NEXT: s_cselect_b32 s4, -1, 0 +; GFX11W32-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_95 +; GFX11W32-NEXT: ; %bb.94: +; GFX11W32-NEXT: v_writelane_b32 v1, s2, 31 +; GFX11W32-NEXT: s_branch .LBB7_96 +; GFX11W32-NEXT: .LBB7_95: +; GFX11W32-NEXT: ; implicit-def: $vgpr1 +; GFX11W32-NEXT: .LBB7_96: +; GFX11W32-NEXT: v_readfirstlane_b32 s5, v2 +; GFX11W32-NEXT: ; implicit-def: $vgpr0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s5, vcc_lo +; GFX11W32-NEXT: s_cbranch_execz .LBB7_98 +; GFX11W32-NEXT: ; %bb.97: +; GFX11W32-NEXT: s_load_b128 s[8:11], s[0:1], 0x34 +; GFX11W32-NEXT: s_and_b32 s4, s4, exec_lo +; GFX11W32-NEXT: s_cselect_b32 s3, s3, 
0 +; GFX11W32-NEXT: v_mov_b32_e32 v2, 0 +; GFX11W32-NEXT: s_add_i32 s2, s2, s3 ; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 -; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX11W32-NEXT: s_or_saveexec_b32 s2, -1 -; GFX11W32-NEXT: v_writelane_b32 v3, s3, 16 -; GFX11W32-NEXT: s_mov_b32 exec_lo, s2 -; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11W32-NEXT: ; implicit-def: $vgpr4 -; GFX11W32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX11W32-NEXT: s_cbranch_execz .LBB7_2 -; GFX11W32-NEXT: ; %bb.1: -; GFX11W32-NEXT: s_mov_b32 s3, s4 -; GFX11W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34 -; GFX11W32-NEXT: v_mov_b32_e32 v4, s3 +; GFX11W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: buffer_atomic_sub_u32 v4, v0, s[4:7], 0 idxen glc -; GFX11W32-NEXT: .LBB7_2: -; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc +; GFX11W32-NEXT: .LBB7_98: +; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11W32-NEXT: s_waitcnt vmcnt(0) -; GFX11W32-NEXT: v_readfirstlane_b32 s2, v4 -; GFX11W32-NEXT: v_mov_b32_e32 v4, v3 -; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W32-NEXT: v_sub_nc_u32_e32 v4, s2, v4 +; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX11W32-NEXT: global_store_b32 v0, v4, s[0:1] +; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11W32-NEXT: s_endpgm entry: Index: llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -4,6 +4,7 @@ define 
amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) dereferenceable(18446744073709551615) %arg0, i32 %arg1) { ; GCN-LABEL: name: mmo_offsets0 ; GCN: bb.0.bb.0: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GCN-NEXT: liveins: $sgpr0, $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -32,165 +33,545 @@ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 80, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY2]].sub0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY2]].sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed 
[[COPY3]], [[COPY5]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY4]], killed [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[S_MOV_B32_]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY6]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_]], killed [[V_MOV_B32_dpp]], 0, implicit $exec + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_ADD_U32_e64_]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], killed [[V_MOV_B32_dpp1]], 0, implicit $exec + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_U32_e64_1]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_1]], killed [[V_MOV_B32_dpp2]], 0, implicit $exec + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_U32_e64_2]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_2]], killed [[V_MOV_B32_dpp3]], 0, implicit $exec + ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY10]], [[V_ADD_U32_e64_3]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_3]], killed [[V_MOV_B32_dpp4]], 0, implicit $exec + ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY 
[[S_MOV_B32_]] + ; GCN-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY11]], [[V_ADD_U32_e64_4]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_4]], killed [[V_MOV_B32_dpp5]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_5]], killed [[S_MOV_B32_1]] + ; GCN-NEXT: early-clobber %1:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec + ; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1 (%ir-block.25): + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY %1 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY12]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_2]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 80, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2 (%ir-block.27): + ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY13]].sub0 + ; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY13]].sub1 + ; GCN-NEXT: 
[[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_1:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY14]], [[COPY16]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_1:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY15]], killed [[V_MBCNT_LO_U32_B32_e64_1]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_1:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[S_MOV_B32_3]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp6:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_SET_INACTIVE_B32_1]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_1]], killed [[V_MOV_B32_dpp6]], 0, implicit $exec + ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp7:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY18]], [[V_ADD_U32_e64_6]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_6]], killed [[V_MOV_B32_dpp7]], 0, implicit $exec + ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp8:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY19]], [[V_ADD_U32_e64_7]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_8:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_7]], killed [[V_MOV_B32_dpp8]], 0, implicit $exec + ; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp9:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY20]], [[V_ADD_U32_e64_8]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_9:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_8]], killed [[V_MOV_B32_dpp9]], 0, implicit $exec + ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp10:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY21]], [[V_ADD_U32_e64_9]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: 
[[V_ADD_U32_e64_10:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_9]], killed [[V_MOV_B32_dpp10]], 0, implicit $exec + ; GCN-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; GCN-NEXT: [[V_MOV_B32_dpp11:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY22]], [[V_ADD_U32_e64_10]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_11:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_10]], killed [[V_MOV_B32_dpp11]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_1:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_11]], killed [[S_MOV_B32_4]] + ; GCN-NEXT: early-clobber %3:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_1]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_1]], [[S_MOV_B32_3]], implicit $exec + ; GCN-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_1]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3 (%ir-block.53): + ; GCN-NEXT: successors: %bb.4(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY %3 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY23]], killed [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.4 (%ir-block.55): + ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF1]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 80, 0, implicit 
$exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 96, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 96, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; 
GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 112, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 112, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; 
GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY2]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) + ; GCN-NEXT: [[COPY24:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET1]] + ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 64 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) + ; GCN-NEXT: 
[[COPY25:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET2]] + ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 128 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 128, align 1, addrspace 7) + ; GCN-NEXT: [[COPY26:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET3]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY27]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_4]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY3]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) + ; GCN-NEXT: [[COPY28:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]] + ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 72 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) + ; GCN-NEXT: [[COPY29:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]] + ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 144 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 144, align 1, addrspace 7) + ; GCN-NEXT: [[COPY30:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 
7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY4]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY33:%[0-9]+]]:sreg_32 = COPY [[COPY32]].sub0 + ; GCN-NEXT: [[COPY34:%[0-9]+]]:sreg_32 = COPY [[COPY32]].sub1 + ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_2:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY33]], [[COPY35]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_2:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY34]], killed [[V_MBCNT_LO_U32_B32_e64_2]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_2:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[S_MOV_B32_6]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: 
[[V_MOV_B32_dpp12:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY36]], [[V_SET_INACTIVE_B32_2]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_12:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_2]], killed [[V_MOV_B32_dpp12]], 0, implicit $exec + ; GCN-NEXT: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp13:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY37]], [[V_ADD_U32_e64_12]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_13:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_12]], killed [[V_MOV_B32_dpp13]], 0, implicit $exec + ; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp14:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY38]], [[V_ADD_U32_e64_13]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_14:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_13]], killed [[V_MOV_B32_dpp14]], 0, implicit $exec + ; GCN-NEXT: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp15:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY39]], [[V_ADD_U32_e64_14]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_15:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_14]], killed [[V_MOV_B32_dpp15]], 0, implicit $exec + ; GCN-NEXT: [[COPY40:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp16:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY40]], [[V_ADD_U32_e64_15]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_16:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_15]], killed [[V_MOV_B32_dpp16]], 0, implicit $exec + ; GCN-NEXT: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_6]] + ; GCN-NEXT: [[V_MOV_B32_dpp17:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY41]], [[V_ADD_U32_e64_16]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_17:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_16]], killed [[V_MOV_B32_dpp17]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: 
[[V_READLANE_B32_2:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_17]], killed [[S_MOV_B32_11]] + ; GCN-NEXT: early-clobber %15:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_2]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_2:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_2]], [[S_MOV_B32_6]], implicit $exec + ; GCN-NEXT: [[SI_IF2:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_2]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.5 (%ir-block.81): + ; GCN-NEXT: successors: %bb.6(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY42:%[0-9]+]]:vgpr_32 = COPY %15 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY42]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_12]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.6 (%ir-block.83): + ; GCN-NEXT: successors: %bb.7(0x40000000), %bb.8(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY43:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY44:%[0-9]+]]:sreg_32 = COPY [[COPY43]].sub0 + ; GCN-NEXT: [[COPY45:%[0-9]+]]:sreg_32 = COPY [[COPY43]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_3:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY44]], [[V_MOV_B32_e32_4]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_3:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY45]], killed [[V_MBCNT_LO_U32_B32_e64_3]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_3:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_4]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp18:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp 
[[V_MOV_B32_e32_4]], [[V_SET_INACTIVE_B32_3]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_18:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_3]], killed [[V_MOV_B32_dpp18]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp19:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_18]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_19:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_18]], killed [[V_MOV_B32_dpp19]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp20:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_19]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_20:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_19]], killed [[V_MOV_B32_dpp20]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp21:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_20]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_21:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_20]], killed [[V_MOV_B32_dpp21]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp22:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_21]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_22:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_21]], killed [[V_MOV_B32_dpp22]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp23:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_4]], [[V_ADD_U32_e64_22]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_23:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_22]], killed [[V_MOV_B32_dpp23]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_3:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_23]], killed [[S_MOV_B32_13]] + ; GCN-NEXT: early-clobber %17:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_3]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_3:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_3]], 
[[V_MOV_B32_e32_4]], implicit $exec + ; GCN-NEXT: [[SI_IF3:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_3]], %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.7 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.7 (%ir-block.109): + ; GCN-NEXT: successors: %bb.8(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 80 + ; GCN-NEXT: [[COPY46:%[0-9]+]]:vgpr_32 = COPY %17 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY46]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_14]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.8 (%ir-block.111): + ; GCN-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY47:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY47]].sub0 + ; GCN-NEXT: [[COPY49:%[0-9]+]]:sreg_32 = COPY [[COPY47]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_4:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY48]], [[V_MOV_B32_e32_5]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_4:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY49]], killed [[V_MBCNT_LO_U32_B32_e64_4]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_4:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_5]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp24:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_SET_INACTIVE_B32_4]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_24:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_4]], killed [[V_MOV_B32_dpp24]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp25:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], 
[[V_ADD_U32_e64_24]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_25:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_24]], killed [[V_MOV_B32_dpp25]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp26:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_25]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_26:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_25]], killed [[V_MOV_B32_dpp26]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp27:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_26]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_27:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_26]], killed [[V_MOV_B32_dpp27]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp28:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_27]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_28:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_27]], killed [[V_MOV_B32_dpp28]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp29:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_5]], [[V_ADD_U32_e64_28]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_29:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_28]], killed [[V_MOV_B32_dpp29]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_4:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_29]], killed [[S_MOV_B32_15]] + ; GCN-NEXT: early-clobber %19:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_4]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_4:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_4]], [[V_MOV_B32_e32_5]], implicit $exec + ; GCN-NEXT: [[SI_IF4:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_4]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.9 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.9 (%ir-block.137): + ; GCN-NEXT: successors: %bb.10(0x80000000) + ; 
GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 160 + ; GCN-NEXT: [[COPY50:%[0-9]+]]:vgpr_32 = COPY %19 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_16]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 160, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.10 (%ir-block.139): + ; GCN-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF4]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 160 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY51:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY51]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY5]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY52:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY52]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 88 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 176 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 176, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE2]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed 
[[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY54:%[0-9]+]]:vreg_128 = COPY [[COPY24]] + ; GCN-NEXT: [[COPY55:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[COPY54]], [[S_LOAD_DWORDX4_IMM]], [[COPY55]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 96 + ; GCN-NEXT: [[COPY56:%[0-9]+]]:vreg_128 = COPY [[COPY25]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[COPY56]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_20]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 
192 + ; GCN-NEXT: [[COPY57:%[0-9]+]]:vreg_128 = COPY [[COPY26]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[COPY57]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 192, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY58:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY58]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: 
BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY59:%[0-9]+]]:vreg_128 = COPY [[COPY28]] + ; GCN-NEXT: [[COPY60:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[COPY60]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 104 + ; GCN-NEXT: [[COPY61:%[0-9]+]]:vreg_128 = COPY [[COPY29]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[COPY61]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_22]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 208 + ; GCN-NEXT: [[COPY62:%[0-9]+]]:vreg_128 = COPY [[COPY30]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[COPY62]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 208, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY63:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY63]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY8]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: 
(dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 - ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY9]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 - ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY10]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) - ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY11]], [[S_LOAD_DWORDX4_IMM]], [[COPY12]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: 
[[COPY64:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY64]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) + ; GCN-NEXT: [[COPY65:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN2]] + ; GCN-NEXT: [[S_MOV_B32_24:%[0-9]+]]:sreg_32 = S_MOV_B32 112 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_24]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) + ; GCN-NEXT: [[COPY66:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN3]] + ; GCN-NEXT: [[S_MOV_B32_25:%[0-9]+]]:sreg_32 = S_MOV_B32 224 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_25]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 224, align 1, addrspace 7) + ; GCN-NEXT: [[COPY67:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN4]] + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_25]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY68:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY68]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY69:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; 
GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_7]], [[S_LOAD_DWORDX4_IMM]], [[COPY69]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY70:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN6]] + ; GCN-NEXT: [[COPY71:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY71]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY13]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 - ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY14]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 - ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY15]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: 
[[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY16]], [[S_LOAD_DWORDX4_IMM]], [[COPY17]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) - ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) + ; GCN-NEXT: [[COPY73:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]] + ; GCN-NEXT: [[S_MOV_B32_26:%[0-9]+]]:sreg_32 = S_MOV_B32 120 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_26]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) + ; GCN-NEXT: [[COPY74:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]] + ; GCN-NEXT: [[S_MOV_B32_27:%[0-9]+]]:sreg_32 = S_MOV_B32 240 + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_27]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from unknown-address + 240, align 1, addrspace 7) + ; GCN-NEXT: 
[[COPY75:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_27]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY76:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_6]], [[S_LOAD_DWORDX4_IMM]], [[COPY76]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_7]], [[S_LOAD_DWORDX4_IMM]], [[COPY77]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY78:%[0-9]+]]:sgpr_128 = COPY [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]] + ; GCN-NEXT: [[COPY79:%[0-9]+]]:sreg_32 = COPY [[V_MOV_B32_e32_6]] + ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY79]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) - ; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 - ; GCN-NEXT: 
[[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY80:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY81:%[0-9]+]]:sreg_32 = COPY [[COPY80]].sub0 + ; GCN-NEXT: [[COPY82:%[0-9]+]]:sreg_32 = COPY [[COPY80]].sub1 + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_5:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY81]], [[V_MOV_B32_e32_6]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_5:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY82]], killed [[V_MBCNT_LO_U32_B32_e64_5]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_5:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_6]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp30:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_SET_INACTIVE_B32_5]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 
[[V_SET_INACTIVE_B32_5]], killed [[V_MOV_B32_dpp30]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp31:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_30]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_31:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_30]], killed [[V_MOV_B32_dpp31]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp32:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_31]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_32:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_31]], killed [[V_MOV_B32_dpp32]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp33:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_32]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_33:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_32]], killed [[V_MOV_B32_dpp33]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp34:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_33]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_34:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_33]], killed [[V_MOV_B32_dpp34]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp35:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_6]], [[V_ADD_U32_e64_34]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_35:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_34]], killed [[V_MOV_B32_dpp35]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_28:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_5:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_35]], killed [[S_MOV_B32_28]] + ; GCN-NEXT: early-clobber %35:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_5]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_5:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_5]], [[V_MOV_B32_e32_6]], implicit $exec + ; GCN-NEXT: [[SI_IF5:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_5]], %bb.12, implicit-def dead $exec, implicit-def 
dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.11 (%ir-block.165): + ; GCN-NEXT: successors: %bb.12(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_29:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[COPY83:%[0-9]+]]:vgpr_32 = COPY %35 + ; GCN-NEXT: [[COPY84:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_29]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY83]], [[COPY84]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_29]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.12 (%ir-block.167): + ; GCN-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF5]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY85:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY86:%[0-9]+]]:sreg_32 = COPY [[COPY85]].sub0 + ; GCN-NEXT: [[COPY87:%[0-9]+]]:sreg_32 = COPY [[COPY85]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_6:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY86]], [[V_MOV_B32_e32_8]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_6:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY87]], killed [[V_MBCNT_LO_U32_B32_e64_6]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_6:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_8]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp36:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_SET_INACTIVE_B32_6]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_36:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_6]], killed [[V_MOV_B32_dpp36]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp37:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_36]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: 
[[V_ADD_U32_e64_37:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_36]], killed [[V_MOV_B32_dpp37]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp38:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_37]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_38:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_37]], killed [[V_MOV_B32_dpp38]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp39:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_38]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_39:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_38]], killed [[V_MOV_B32_dpp39]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp40:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_39]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_40:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_39]], killed [[V_MOV_B32_dpp40]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp41:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_8]], [[V_ADD_U32_e64_40]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_41:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_40]], killed [[V_MOV_B32_dpp41]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_30:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_6:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_41]], killed [[S_MOV_B32_30]] + ; GCN-NEXT: early-clobber %37:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_6]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_6:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_6]], [[V_MOV_B32_e32_8]], implicit $exec + ; GCN-NEXT: [[SI_IF6:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_6]], %bb.14, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.13 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.13 (%ir-block.193): + ; GCN-NEXT: successors: %bb.14(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_31:%[0-9]+]]:sreg_32 
= S_MOV_B32 128 + ; GCN-NEXT: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY88:%[0-9]+]]:vgpr_32 = COPY %37 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY88]], killed [[V_MOV_B32_e32_9]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_31]], 128, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.14 (%ir-block.195): + ; GCN-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF6]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[COPY89:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY90:%[0-9]+]]:sreg_32 = COPY [[COPY89]].sub0 + ; GCN-NEXT: [[COPY91:%[0-9]+]]:sreg_32 = COPY [[COPY89]].sub1 + ; GCN-NEXT: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_7:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY90]], [[V_MOV_B32_e32_10]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_7:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY91]], killed [[V_MBCNT_LO_U32_B32_e64_7]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_7:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_10]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp42:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_SET_INACTIVE_B32_7]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_42:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_SET_INACTIVE_B32_7]], killed [[V_MOV_B32_dpp42]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp43:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_42]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_43:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_42]], killed [[V_MOV_B32_dpp43]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp44:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], 
[[V_ADD_U32_e64_43]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_44:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_43]], killed [[V_MOV_B32_dpp44]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp45:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_44]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_45:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_44]], killed [[V_MOV_B32_dpp45]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp46:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_45]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_46:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_45]], killed [[V_MOV_B32_dpp46]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp47:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_10]], [[V_ADD_U32_e64_46]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_47:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_46]], killed [[V_MOV_B32_dpp47]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_32:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_7:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_47]], killed [[S_MOV_B32_32]] + ; GCN-NEXT: early-clobber %39:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_7]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_7:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_7]], [[V_MOV_B32_e32_10]], implicit $exec + ; GCN-NEXT: [[SI_IF7:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_7]], %bb.16, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.15 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.15 (%ir-block.221): + ; GCN-NEXT: successors: %bb.16(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_33:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; GCN-NEXT: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[COPY92:%[0-9]+]]:vgpr_32 = COPY %39 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY92]], 
killed [[V_MOV_B32_e32_11]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_33]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 256, align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.16 (%ir-block.223): + ; GCN-NEXT: successors: %bb.17(0x40000000), %bb.18(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_END_CF [[SI_IF7]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[S_MOV_B32_34:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], killed [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_34]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY93:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_12]], [[S_LOAD_DWORDX4_IMM]], [[COPY93]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY94:%[0-9]+]]:sreg_64 = COPY $exec + ; GCN-NEXT: [[COPY95:%[0-9]+]]:sreg_32 = COPY [[COPY94]].sub0 + ; GCN-NEXT: [[COPY96:%[0-9]+]]:sreg_32 = COPY [[COPY94]].sub1 + ; GCN-NEXT: [[V_MBCNT_LO_U32_B32_e64_8:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed [[COPY95]], [[V_MOV_B32_e32_12]], implicit $exec + ; GCN-NEXT: [[V_MBCNT_HI_U32_B32_e64_8:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 killed [[COPY96]], killed [[V_MBCNT_LO_U32_B32_e64_8]], implicit $exec + ; GCN-NEXT: [[V_SET_INACTIVE_B32_8:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY]], [[V_MOV_B32_e32_12]], implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp48:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_SET_INACTIVE_B32_8]], 273, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_48:%[0-9]+]]:vgpr_32 = 
V_ADD_U32_e64 [[V_SET_INACTIVE_B32_8]], killed [[V_MOV_B32_dpp48]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp49:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_48]], 274, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_49:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_48]], killed [[V_MOV_B32_dpp49]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp50:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_49]], 276, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_50:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_49]], killed [[V_MOV_B32_dpp50]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp51:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_50]], 280, 15, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_51:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_50]], killed [[V_MOV_B32_dpp51]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp52:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_51]], 322, 10, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_52:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_51]], killed [[V_MOV_B32_dpp52]], 0, implicit $exec + ; GCN-NEXT: [[V_MOV_B32_dpp53:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_12]], [[V_ADD_U32_e64_52]], 323, 12, 15, 0, implicit $exec + ; GCN-NEXT: [[V_ADD_U32_e64_53:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_52]], killed [[V_MOV_B32_dpp53]], 0, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_35:%[0-9]+]]:sreg_32 = S_MOV_B32 63 + ; GCN-NEXT: [[V_READLANE_B32_8:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_U32_e64_53]], killed [[S_MOV_B32_35]] + ; GCN-NEXT: early-clobber %41:sreg_32 = STRICT_WWM killed [[V_READLANE_B32_8]], implicit $exec + ; GCN-NEXT: [[V_CMP_EQ_U32_e64_8:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_8]], [[V_MOV_B32_e32_12]], implicit $exec + ; GCN-NEXT: [[SI_IF8:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_8]], %bb.18, implicit-def dead 
$exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.17 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.17 (%ir-block.249): + ; GCN-NEXT: successors: %bb.18(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_36:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[COPY97:%[0-9]+]]:vgpr_32 = COPY %41 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY97]], killed [[V_MOV_B32_e32_13]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_36]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.18 (%ir-block.251): + ; GCN-NEXT: SI_END_CF [[SI_IF8]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: [[S_MOV_B32_37:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY23]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 - ; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY24]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 - ; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit 
$exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY26]], [[S_LOAD_DWORDX4_IMM]], [[COPY27]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[COPY98:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY98]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_38:%[0-9]+]]:sreg_32 = S_MOV_B32 136 + ; GCN-NEXT: [[COPY99:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY99]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_38]], 136, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_39:%[0-9]+]]:sreg_32 = S_MOV_B32 272 + ; GCN-NEXT: [[COPY100:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN 
[[REG_SEQUENCE5]], [[COPY100]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_39]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32) on unknown-address + 272, align 1, addrspace 7) + ; GCN-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_37]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE5]], [[REG_SEQUENCE6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_39]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[COPY101:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: [[COPY102:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY101]], [[S_LOAD_DWORDX4_IMM]], [[COPY102]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[V_MOV_B32_e32_14]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) - ; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into 
unknown-address + 288, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 - ; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY103:%[0-9]+]]:vreg_128 = COPY [[COPY65]] + ; GCN-NEXT: [[COPY104:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY103]], [[COPY104]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_40:%[0-9]+]]:sreg_32 = S_MOV_B32 144 + ; GCN-NEXT: [[COPY105:%[0-9]+]]:vreg_128 = COPY [[COPY66]] + ; GCN-NEXT: [[COPY106:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: 
BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY105]], [[COPY106]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_40]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_41:%[0-9]+]]:sreg_32 = S_MOV_B32 288 + ; GCN-NEXT: [[COPY107:%[0-9]+]]:vreg_128 = COPY [[COPY67]] + ; GCN-NEXT: [[COPY108:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY107]], [[COPY108]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_41]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 288, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_41]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY109:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: [[COPY110:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY109]], [[S_LOAD_DWORDX4_IMM]], [[COPY110]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY111:%[0-9]+]]:vreg_128 = COPY [[COPY70]] + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[COPY111]], [[V_MOV_B32_e32_14]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */ - ; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY33]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into 
unknown-address + 304, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 - ; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) - ; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 - ; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY35]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[COPY37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) - ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY112:%[0-9]+]]:vreg_128 = COPY [[COPY73]] + ; GCN-NEXT: [[COPY113:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: 
BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY112]], [[COPY113]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_42:%[0-9]+]]:sreg_32 = S_MOV_B32 152 + ; GCN-NEXT: [[COPY114:%[0-9]+]]:vreg_128 = COPY [[COPY74]] + ; GCN-NEXT: [[COPY115:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY114]], [[COPY115]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_42]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) + ; GCN-NEXT: [[S_MOV_B32_43:%[0-9]+]]:sreg_32 = S_MOV_B32 304 + ; GCN-NEXT: [[COPY116:%[0-9]+]]:vreg_128 = COPY [[COPY75]] + ; GCN-NEXT: [[COPY117:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY116]], [[COPY117]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_43]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into unknown-address + 304, align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE6]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_43]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY118:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_37]] + ; GCN-NEXT: [[COPY119:%[0-9]+]]:sreg_32 = COPY [[COPY]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY118]], [[S_LOAD_DWORDX4_IMM]], [[COPY119]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: [[COPY120:%[0-9]+]]:vreg_128 = COPY [[COPY78]] + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[COPY120]], [[V_MOV_B32_e32_14]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_37]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) ; GCN-NEXT: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0 Index: llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -9,14 +9,27 @@ ; CHECK-LABEL: add: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_add v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_add_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -31,14 +44,27 @@ ; CHECK-LABEL: sub: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; 
CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_sub v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_sub_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -53,14 +79,26 @@ ; CHECK-LABEL: and: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_and v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: s_or_b64 
exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_and_b32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -75,14 +113,28 @@ ; CHECK-LABEL: or: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB3_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_or v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_or_b32_e32 v0, s2, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -97,14 +149,29 @@ ; CHECK-LABEL: xor: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: 
v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc +; CHECK-NEXT: s_cbranch_execz .LBB4_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_xor v1, v1, v2, s[0:1] glc +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -155,14 +222,27 @@ ; CHECK-LABEL: max_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB6_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_smax v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB6_2: +; CHECK-NEXT: s_or_b64 exec, 
exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_max_i32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -212,14 +292,27 @@ ; CHECK-LABEL: min_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB8_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_smin v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB8_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, -2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_min_i32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -269,14 +362,28 @@ ; CHECK-LABEL: umax_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; 
CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB10_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: global_atomic_umax v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_xor_b64 s[0:1], vcc, -1 +; CHECK-NEXT: v_mov_b32_e32 v2, s2 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_max_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -326,14 +433,26 @@ ; CHECK-LABEL: umin_workgroup: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc +; CHECK-NEXT: s_cbranch_execz .LBB12_2 +; CHECK-NEXT: ; %bb.1: ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1] +; CHECK-NEXT: global_atomic_umin v0, v0, v1, s[0:1] glc +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: v_mov_b32_e32 v3, s3 +; CHECK-NEXT: v_min_u32_e32 v0, s0, v0 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3] ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -616,15 +735,29 @@ define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.add: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB23_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB23_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_add_u32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -638,15 +771,29 @@ define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, ptr 
addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.sub: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB24_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_sub_u32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -660,16 +807,30 @@ define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.smin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB25_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, -2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_min_i32_e32 v0, s2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -682,16 +843,30 @@ define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.smax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB26_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt 
vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_bfrev_b32_e32 v0, 1 +; CHECK-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc +; CHECK-NEXT: v_max_i32_e32 v0, s2, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -704,15 +879,28 @@ define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.umin: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB27_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_min_u32_e32 v0, s2, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -726,16 +914,30 @@ define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, 
ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.umax: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB28_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_max_u32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -748,15 +950,28 @@ define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.and: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB29_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 1, -1, vcc +; CHECK-NEXT: v_and_b32_e32 v0, s2, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm @@ -770,16 +985,30 @@ define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.or: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB30_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s8, s[0:1], 0x34 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c -; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s8 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc +; CHECK-NEXT: .LBB30_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_xor_b64 s[2:3], vcc, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 +; CHECK-NEXT: v_cndmask_b32_e64 
v0, 0, 1, s[2:3] +; CHECK-NEXT: v_or_b32_e32 v0, s4, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) @@ -792,15 +1021,31 @@ define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, ptr addrspace(1) %q) { ; CHECK-LABEL: buffer.atomic.xor: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_mov_b64 s[4:5], exec +; CHECK-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; CHECK-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: ; implicit-def: $vgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_cbranch_execz .LBB31_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: s_load_dword s6, s[0:1], 0x34 +; CHECK-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24 +; CHECK-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: buffer_atomic_xor v1, v2, s[8:11], 0 offen glc +; CHECK-NEXT: .LBB31_2: +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_readfirstlane_b32 s2, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_xor_b32_e32 v0, s2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] ; CHECK-NEXT: global_store_dword v[0:1], v2, off ; CHECK-NEXT: s_endpgm Index: 
llvm/test/CodeGen/AMDGPU/gds-allocation.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gds-allocation.ll +++ llvm/test/CodeGen/AMDGPU/gds-allocation.ll @@ -10,16 +10,28 @@ define amdgpu_kernel void @alloc_lds_gds(ptr addrspace(1) %out) #1 { ; GCN-LABEL: alloc_lds_gds: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 5 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_mov_b64 s[0:1], exec ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 gds ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3 %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel @@ -32,18 +44,44 @@ define amdgpu_kernel void @alloc_lds_gds_align(ptr addrspace(1) %out) #1 { ; GCN-LABEL: alloc_lds_gds_align: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v0, 5 -; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, 16 +; GCN-NEXT: s_mov_b64 s[0:1], exec ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 gds +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 gds ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: buffer_wbinvl1 +; GCN-NEXT: s_waitcnt expcnt(0) +; 
GCN-NEXT: v_mbcnt_lo_u32_b32 v1, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:140 +; GCN-NEXT: ds_add_u32 v0, v1 offset:140 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB1_2: +; GCN-NEXT: s_or_b64 exec, exec, s[2:3] +; GCN-NEXT: s_mov_b64 s[0:1], exec +; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GCN-NEXT: s_cbranch_execz .LBB1_4 +; GCN-NEXT: ; %bb.3: +; GCN-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GCN-NEXT: s_mul_i32 s0, s0, 5 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: ds_add_u32 v1, v0 offset:12 +; GCN-NEXT: ds_add_u32 v0, v1 offset:12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB1_4: ; GCN-NEXT: s_endpgm %gep.gds = getelementptr [4 x i32], ptr addrspace(2) @gds0, i32 0, i32 3 %val0 = atomicrmw add ptr addrspace(2) %gep.gds, i32 5 acq_rel Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -256,11 +256,14 @@ ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR late optimizations +; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Post-Dominator Tree Construction +; GCN-O1-NEXT: Cycle Info Analysis +; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: Unify divergent function 
exit nodes ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to branches @@ -556,11 +559,14 @@ ; GCN-O1-OPTS-NEXT: Cycle Info Analysis ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations +; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Cycle Info Analysis +; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches @@ -864,11 +870,14 @@ ; GCN-O2-NEXT: Cycle Info Analysis ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU IR late optimizations +; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Post-Dominator Tree Construction +; GCN-O2-NEXT: Cycle Info Analysis +; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches @@ -1185,11 +1194,14 @@ ; GCN-O3-NEXT: Cycle Info Analysis ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU IR late optimizations +; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Post-Dominator Tree Construction +; GCN-O3-NEXT: Cycle Info Analysis +; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower 
SwitchInst's to branches Index: llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -573,12 +573,36 @@ ret void } -; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store: -; CGN: global_store_dword -; GCN: ds_add_u32 -; GCN: s_load_dword s -; GCN-NOT: global_load_dword -; GCN: global_store_dword +; GCN-LABEL: no_alias_atomic_rmw_then_no_alias_store: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_store_dword v0, v1, s[4:5] +; GCN-NEXT: v_mbcnt_hi_u32_b32 v1, s3, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN-NEXT: s_cbranch_execz .LBB17_2 +; GCN-NEXT: ; %bb.1: +; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GCN-NEXT: s_mul_i32 s2, s2, 5 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: ds_add_u32 v0, v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: .LBB17_2: +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_barrier +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dword v1, v0, s[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_endpgm define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) { ; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store( ; CHECK-NEXT: entry: @@ -602,5 +626,6 @@ ret void } + declare void @llvm.amdgcn.s.barrier() declare void @llvm.amdgcn.wave.barrier()