Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1752,12 +1752,15 @@ if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) return false; + unsigned SubReg; + std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, + ValSize / 8); + const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && STI.useVGPRIndexMode(); MachineBasicBlock *BB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned SubReg = ValSize == 64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0; if (IndexMode) { BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1380,6 +1380,25 @@ constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI); } +/// Utility function for pushing dynamic vector indexes with a constant offset +/// into waterfall loops. 
+static void reinsertVectorIndexAdd(MachineIRBuilder &B, + MachineInstr &IdxUseInstr, + unsigned OpIdx, + unsigned ConstOffset) { + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT S32 = LLT::scalar(32); + Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); + B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); + + auto MaterializedOffset = B.buildConstant(S32, ConstOffset); + + auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); + MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); + IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); +} + void AMDGPURegisterBankInfo::applyMappingImpl( const OperandsMapper &OpdMapper) const { MachineInstr &MI = OpdMapper.getMI(); @@ -1741,20 +1760,6 @@ ConstOffset > 0 && ConstOffset < SrcTy.getNumElements(); - // Re-insert the constant offset add inside the waterfall loop. - auto ReinsertIndexAdd = [=, &B, &MRI](MachineInstr &IdxUseInstr, - unsigned OpIdx) { - Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg(); - B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator()); - - auto MaterializedOffset = B.buildConstant(S32, ConstOffset); - - auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset); - MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank); - MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank); - IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0)); - }; - // Move the base register. We'll re-insert the add later. if (ShouldMoveIndexIntoLoop) MI.getOperand(2).setReg(BaseIdxReg); @@ -1781,8 +1786,9 @@ buildVCopy(B, DstReg, TmpReg); } + // Re-insert the constant offset add inside the waterfall loop. 
if (ShouldMoveIndexIntoLoop) - ReinsertIndexAdd(MI, 2); + reinsertVectorIndexAdd(B, MI, 2, ConstOffset); return; } @@ -1843,7 +1849,7 @@ } if (ShouldMoveIndexIntoLoop) - ReinsertIndexAdd(*IdxLo, 1); + reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); return; } @@ -1856,26 +1862,53 @@ assert(OpdMapper.getVRegs(0).empty()); assert(OpdMapper.getVRegs(3).empty()); + const RegisterBank *IdxBank = + OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank; + if (substituteSimpleCopyRegs(OpdMapper, 1)) MRI.setType(MI.getOperand(1).getReg(), VecTy); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + Register BaseIdxReg; + unsigned ConstOffset; + MachineInstr *OffsetDef; + std::tie(BaseIdxReg, ConstOffset, OffsetDef) = + AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg()); + + // See if the index is an add of a constant which will be foldable by moving + // the base register of the index later if this is going to be executed in a + // waterfall loop. This is essentially to reassociate the add of a constant + // with the readfirstlane. + bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank && + ConstOffset > 0 && + ConstOffset < VecTy.getNumElements(); + + // Move the base register. We'll re-insert the add later. + if (ShouldMoveIndexIntoLoop) + MI.getOperand(3).setReg(BaseIdxReg); + + if (InsRegs.empty()) { - applyDefaultMapping(OpdMapper); executeInWaterfallLoop(MI, MRI, { 3 }); + + // Re-insert the constant offset add inside the waterfall loop. 
+ if (ShouldMoveIndexIntoLoop) { + MachineIRBuilder B(MI); + reinsertVectorIndexAdd(B, MI, 3, ConstOffset); + } + return; } - Register SrcReg = MI.getOperand(1).getReg(); - Register InsReg = MI.getOperand(2).getReg(); - Register IdxReg = MI.getOperand(3).getReg(); - LLT SrcTy = MRI.getType(SrcReg); - LLT InsTy = MRI.getType(InsReg); - (void)InsTy; assert(InsTy.getSizeInBits() == 64); const LLT S32 = LLT::scalar(32); - LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32); MachineIRBuilder B(MI); auto CastSrc = B.buildBitcast(Vec32, SrcReg); @@ -1888,7 +1921,7 @@ MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). - auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxLo = B.buildShl(S32, BaseIdxReg, One); auto IdxHi = B.buildAdd(S32, IdxLo, One); auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); @@ -1919,6 +1952,11 @@ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), OpsToWaterfall, MRI); + + // Re-insert the constant offset add inside the waterfall loop. 
+ if (ShouldMoveIndexIntoLoop) + reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset); + return; } case AMDGPU::G_INTRINSIC: { Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -747,10 +747,9 @@ ; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s6, v2 +; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 -; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1 -; GPRIDX-NEXT: s_add_u32 s7, s6, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v34, v18 ; GPRIDX-NEXT: v_mov_b32_e32 v33, v17 ; GPRIDX-NEXT: v_mov_b32_e32 v32, v16 @@ -770,7 +769,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v19, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -792,6 +791,7 @@ ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_mov_b32 s8, 0 +; MOVREL-NEXT: s_mov_b32 s19, 0x40200000 ; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: s_mov_b32 s9, 0x40080000 @@ -802,63 +802,59 @@ ; MOVREL-NEXT: s_mov_b32 s14, s8 ; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000 ; MOVREL-NEXT: s_mov_b32 s16, s8 -; MOVREL-NEXT: s_mov_b32 s19, 0x40200000 ; MOVREL-NEXT: s_mov_b32 s18, s8 -; MOVREL-NEXT: v_mov_b32_e32 v3, s4 -; MOVREL-NEXT: v_mov_b32_e32 v4, s5 -; MOVREL-NEXT: v_mov_b32_e32 v5, s6 -; MOVREL-NEXT: v_mov_b32_e32 v6, s7 -; MOVREL-NEXT: v_mov_b32_e32 v7, s8 -; MOVREL-NEXT: v_mov_b32_e32 v8, s9 -; 
MOVREL-NEXT: v_mov_b32_e32 v9, s10 -; MOVREL-NEXT: v_mov_b32_e32 v10, s11 -; MOVREL-NEXT: v_mov_b32_e32 v11, s12 -; MOVREL-NEXT: v_mov_b32_e32 v12, s13 -; MOVREL-NEXT: v_mov_b32_e32 v13, s14 -; MOVREL-NEXT: v_mov_b32_e32 v14, s15 -; MOVREL-NEXT: v_mov_b32_e32 v15, s16 -; MOVREL-NEXT: v_mov_b32_e32 v16, s17 -; MOVREL-NEXT: v_mov_b32_e32 v17, s18 -; MOVREL-NEXT: v_mov_b32_e32 v18, s19 -; MOVREL-NEXT: s_mov_b32 s4, exec_lo ; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill +; MOVREL-NEXT: v_mov_b32_e32 v34, s19 +; MOVREL-NEXT: v_mov_b32_e32 v33, s18 +; MOVREL-NEXT: v_mov_b32_e32 v32, s17 +; MOVREL-NEXT: v_mov_b32_e32 v31, s16 +; MOVREL-NEXT: v_mov_b32_e32 v30, s15 +; MOVREL-NEXT: v_mov_b32_e32 v29, s14 +; MOVREL-NEXT: v_mov_b32_e32 v28, s13 +; MOVREL-NEXT: v_mov_b32_e32 v27, s12 +; MOVREL-NEXT: v_mov_b32_e32 v26, s11 +; MOVREL-NEXT: v_mov_b32_e32 v25, s10 +; MOVREL-NEXT: v_mov_b32_e32 v24, s9 +; MOVREL-NEXT: v_mov_b32_e32 v23, s8 +; MOVREL-NEXT: v_mov_b32_e32 v22, s7 +; MOVREL-NEXT: v_mov_b32_e32 v21, s6 +; MOVREL-NEXT: v_mov_b32_e32 v20, s5 +; MOVREL-NEXT: v_mov_b32_e32 v19, s4 +; MOVREL-NEXT: s_mov_b32 s4, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v2 -; MOVREL-NEXT: v_mov_b32_e32 v34, v18 -; MOVREL-NEXT: v_mov_b32_e32 v19, v3 -; MOVREL-NEXT: v_mov_b32_e32 v33, v17 -; MOVREL-NEXT: v_mov_b32_e32 v32, v16 -; MOVREL-NEXT: s_lshl_b32 s6, s5, 1 +; MOVREL-NEXT: v_mov_b32_e32 v3, v19 +; MOVREL-NEXT: v_mov_b32_e32 v4, v20 +; MOVREL-NEXT: v_mov_b32_e32 v5, v21 +; MOVREL-NEXT: v_mov_b32_e32 v6, v22 +; MOVREL-NEXT: s_lshl_b32 m0, s5, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 -; MOVREL-NEXT: v_mov_b32_e32 v31, v15 -; MOVREL-NEXT: v_mov_b32_e32 v30, v14 -; MOVREL-NEXT: 
v_mov_b32_e32 v29, v13 -; MOVREL-NEXT: s_add_u32 s5, s6, 1 -; MOVREL-NEXT: s_mov_b32 m0, s6 -; MOVREL-NEXT: v_mov_b32_e32 v28, v12 -; MOVREL-NEXT: v_mov_b32_e32 v27, v11 -; MOVREL-NEXT: v_mov_b32_e32 v26, v10 -; MOVREL-NEXT: v_mov_b32_e32 v25, v9 -; MOVREL-NEXT: v_mov_b32_e32 v24, v8 -; MOVREL-NEXT: v_mov_b32_e32 v23, v7 -; MOVREL-NEXT: v_mov_b32_e32 v22, v6 -; MOVREL-NEXT: v_mov_b32_e32 v21, v5 -; MOVREL-NEXT: v_mov_b32_e32 v20, v4 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v0 -; MOVREL-NEXT: s_mov_b32 m0, s5 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v1 +; MOVREL-NEXT: v_mov_b32_e32 v7, v23 +; MOVREL-NEXT: v_mov_b32_e32 v8, v24 +; MOVREL-NEXT: v_mov_b32_e32 v9, v25 +; MOVREL-NEXT: v_mov_b32_e32 v10, v26 +; MOVREL-NEXT: v_mov_b32_e32 v11, v27 +; MOVREL-NEXT: v_mov_b32_e32 v12, v28 +; MOVREL-NEXT: v_mov_b32_e32 v13, v29 +; MOVREL-NEXT: v_mov_b32_e32 v14, v30 +; MOVREL-NEXT: v_mov_b32_e32 v15, v31 +; MOVREL-NEXT: v_mov_b32_e32 v16, v32 +; MOVREL-NEXT: v_mov_b32_e32 v17, v33 +; MOVREL-NEXT: v_mov_b32_e32 v18, v34 +; MOVREL-NEXT: v_movreld_b32_e32 v3, v0 +; MOVREL-NEXT: v_movreld_b32_e32 v4, v1 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB13_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s4 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off ; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload ; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 
4-byte Folded Reload @@ -916,10 +912,9 @@ ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v0 +; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 -; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v32, v16 ; GPRIDX-NEXT: v_mov_b32_e32 v31, v15 ; GPRIDX-NEXT: v_mov_b32_e32 v30, v14 @@ -939,7 +934,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v17, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -970,58 +965,55 @@ ; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v16, s15 -; MOVREL-NEXT: v_mov_b32_e32 v14, s13 -; MOVREL-NEXT: v_mov_b32_e32 v12, s11 -; MOVREL-NEXT: v_mov_b32_e32 v13, s12 -; MOVREL-NEXT: v_mov_b32_e32 v15, s14 -; MOVREL-NEXT: v_mov_b32_e32 v11, s10 -; MOVREL-NEXT: v_mov_b32_e32 v10, s9 -; MOVREL-NEXT: v_mov_b32_e32 v9, s8 -; MOVREL-NEXT: v_mov_b32_e32 v8, s7 -; MOVREL-NEXT: v_mov_b32_e32 v7, s6 -; MOVREL-NEXT: v_mov_b32_e32 v6, s5 -; MOVREL-NEXT: v_mov_b32_e32 v5, s4 -; MOVREL-NEXT: v_mov_b32_e32 v4, s3 -; MOVREL-NEXT: v_mov_b32_e32 v3, s2 -; MOVREL-NEXT: v_mov_b32_e32 v2, s1 -; MOVREL-NEXT: v_mov_b32_e32 v1, s0 +; MOVREL-NEXT: v_mov_b32_e32 v32, s15 +; MOVREL-NEXT: v_mov_b32_e32 v30, s13 +; MOVREL-NEXT: v_mov_b32_e32 v28, s11 +; MOVREL-NEXT: v_mov_b32_e32 v29, s12 +; MOVREL-NEXT: v_mov_b32_e32 v31, s14 +; MOVREL-NEXT: v_mov_b32_e32 v27, s10 +; MOVREL-NEXT: v_mov_b32_e32 v26, s9 +; MOVREL-NEXT: v_mov_b32_e32 v25, s8 +; MOVREL-NEXT: v_mov_b32_e32 v24, s7 +; MOVREL-NEXT: v_mov_b32_e32 v23, s6 
+; MOVREL-NEXT: v_mov_b32_e32 v22, s5 +; MOVREL-NEXT: v_mov_b32_e32 v21, s4 +; MOVREL-NEXT: v_mov_b32_e32 v20, s3 +; MOVREL-NEXT: v_mov_b32_e32 v19, s2 +; MOVREL-NEXT: v_mov_b32_e32 v18, s1 +; MOVREL-NEXT: v_mov_b32_e32 v17, s0 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 -; MOVREL-NEXT: v_mov_b32_e32 v32, v16 -; MOVREL-NEXT: v_mov_b32_e32 v17, v1 -; MOVREL-NEXT: v_mov_b32_e32 v31, v15 -; MOVREL-NEXT: v_mov_b32_e32 v30, v14 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: v_mov_b32_e32 v1, v17 +; MOVREL-NEXT: v_mov_b32_e32 v2, v18 +; MOVREL-NEXT: v_mov_b32_e32 v3, v19 +; MOVREL-NEXT: v_mov_b32_e32 v4, v20 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0 -; MOVREL-NEXT: v_mov_b32_e32 v29, v13 -; MOVREL-NEXT: v_mov_b32_e32 v28, v12 -; MOVREL-NEXT: v_mov_b32_e32 v27, v11 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v26, v10 -; MOVREL-NEXT: v_mov_b32_e32 v25, v9 -; MOVREL-NEXT: v_mov_b32_e32 v24, v8 -; MOVREL-NEXT: v_mov_b32_e32 v23, v7 -; MOVREL-NEXT: v_mov_b32_e32 v22, v6 -; MOVREL-NEXT: v_mov_b32_e32 v21, v5 -; MOVREL-NEXT: v_mov_b32_e32 v20, v4 -; MOVREL-NEXT: v_mov_b32_e32 v19, v3 -; MOVREL-NEXT: v_mov_b32_e32 v18, v2 -; MOVREL-NEXT: v_movreld_b32_e32 v17, s18 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v17, s19 +; MOVREL-NEXT: v_mov_b32_e32 v5, v21 +; MOVREL-NEXT: v_mov_b32_e32 v6, v22 +; MOVREL-NEXT: v_mov_b32_e32 v7, v23 +; MOVREL-NEXT: v_mov_b32_e32 v8, v24 +; MOVREL-NEXT: v_mov_b32_e32 v9, v25 +; MOVREL-NEXT: v_mov_b32_e32 v10, v26 +; MOVREL-NEXT: v_mov_b32_e32 v11, v27 +; MOVREL-NEXT: v_mov_b32_e32 v12, v28 +; MOVREL-NEXT: v_mov_b32_e32 v13, v29 +; MOVREL-NEXT: v_mov_b32_e32 v14, v30 +; MOVREL-NEXT: v_mov_b32_e32 v15, v31 +; MOVREL-NEXT: v_mov_b32_e32 v16, v32 +; MOVREL-NEXT: v_movreld_b32_e32 v1, s18 +; 
MOVREL-NEXT: v_movreld_b32_e32 v2, s19 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB14_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[17:20], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[21:24], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[25:28], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[29:32], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[13:16], off ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1072,12 +1064,11 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 ; GPRIDX-NEXT: s_lshl_b32 s0, s18, 1 -; GPRIDX-NEXT: s_add_u32 s1, s0, 1 ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v2, v0 ; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v2, v1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v3, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off @@ -1103,12 +1094,12 @@ ; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: s_lshl_b32 s16, s18, 1 ; MOVREL-NEXT: v_mov_b32_e32 v17, s15 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 +; MOVREL-NEXT: s_lshl_b32 m0, s18, 1 ; MOVREL-NEXT: v_mov_b32_e32 v15, s13 -; MOVREL-NEXT: v_mov_b32_e32 v14, s12 ; MOVREL-NEXT: v_mov_b32_e32 v16, s14 -; MOVREL-NEXT: s_mov_b32 m0, s16 +; MOVREL-NEXT: v_mov_b32_e32 v14, s12 ; MOVREL-NEXT: v_mov_b32_e32 v13, s11 ; MOVREL-NEXT: v_mov_b32_e32 v12, s10 ; MOVREL-NEXT: v_mov_b32_e32 v11, s9 @@ -1120,15 
+1111,12 @@ ; MOVREL-NEXT: v_mov_b32_e32 v5, s3 ; MOVREL-NEXT: v_mov_b32_e32 v4, s2 ; MOVREL-NEXT: v_mov_b32_e32 v3, s1 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 -; MOVREL-NEXT: s_add_u32 s0, s16, 1 -; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: v_movreld_b32_e32 v2, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v3, v1 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off ; MOVREL-NEXT: s_endpgm entry: @@ -1148,12 +1136,11 @@ ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_s: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_lshl_b32 s0, s4, 1 -; GPRIDX-NEXT: s_add_u32 s1, s0, 1 ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 ; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, s3 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v1, s3 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off @@ -1163,13 +1150,10 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_lshl_b32 s0, s4, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s4, 1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: s_add_u32 s0, s0, 1 ; MOVREL-NEXT: v_movreld_b32_e32 v0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: v_movreld_b32_e32 v0, s3 +; MOVREL-NEXT: v_movreld_b32_e32 v1, s3 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -1226,10 +1210,9 @@ ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB17_1: ; =>This 
Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 +; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 -; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v34, v18 ; GPRIDX-NEXT: v_mov_b32_e32 v33, v17 ; GPRIDX-NEXT: v_mov_b32_e32 v32, v16 @@ -1249,7 +1232,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v19, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v1 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -1280,58 +1263,55 @@ ; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 -; MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; MOVREL-NEXT: v_mov_b32_e32 v15, s12 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; MOVREL-NEXT: v_mov_b32_e32 v12, s9 -; MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; MOVREL-NEXT: v_mov_b32_e32 v10, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s5 -; MOVREL-NEXT: v_mov_b32_e32 v7, s4 -; MOVREL-NEXT: v_mov_b32_e32 v6, s3 -; MOVREL-NEXT: v_mov_b32_e32 v5, s2 -; MOVREL-NEXT: v_mov_b32_e32 v4, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_mov_b32_e32 v34, s15 +; MOVREL-NEXT: v_mov_b32_e32 v32, s13 +; MOVREL-NEXT: v_mov_b32_e32 v30, s11 +; MOVREL-NEXT: v_mov_b32_e32 v31, s12 +; MOVREL-NEXT: v_mov_b32_e32 v33, s14 +; MOVREL-NEXT: v_mov_b32_e32 v29, s10 +; MOVREL-NEXT: v_mov_b32_e32 v28, s9 +; MOVREL-NEXT: v_mov_b32_e32 v27, s8 +; MOVREL-NEXT: v_mov_b32_e32 v26, s7 +; MOVREL-NEXT: v_mov_b32_e32 v25, s6 +; MOVREL-NEXT: v_mov_b32_e32 v24, s5 +; MOVREL-NEXT: v_mov_b32_e32 v23, s4 +; MOVREL-NEXT: v_mov_b32_e32 
v22, s3 +; MOVREL-NEXT: v_mov_b32_e32 v21, s2 +; MOVREL-NEXT: v_mov_b32_e32 v20, s1 +; MOVREL-NEXT: v_mov_b32_e32 v19, s0 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v2 -; MOVREL-NEXT: v_mov_b32_e32 v34, v18 -; MOVREL-NEXT: v_mov_b32_e32 v19, v3 -; MOVREL-NEXT: v_mov_b32_e32 v33, v17 -; MOVREL-NEXT: v_mov_b32_e32 v32, v16 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: v_mov_b32_e32 v3, v19 +; MOVREL-NEXT: v_mov_b32_e32 v4, v20 +; MOVREL-NEXT: v_mov_b32_e32 v5, v21 +; MOVREL-NEXT: v_mov_b32_e32 v6, v22 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2 -; MOVREL-NEXT: v_mov_b32_e32 v31, v15 -; MOVREL-NEXT: v_mov_b32_e32 v30, v14 -; MOVREL-NEXT: v_mov_b32_e32 v29, v13 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 -; MOVREL-NEXT: v_mov_b32_e32 v28, v12 -; MOVREL-NEXT: v_mov_b32_e32 v27, v11 -; MOVREL-NEXT: v_mov_b32_e32 v26, v10 -; MOVREL-NEXT: v_mov_b32_e32 v25, v9 -; MOVREL-NEXT: v_mov_b32_e32 v24, v8 -; MOVREL-NEXT: v_mov_b32_e32 v23, v7 -; MOVREL-NEXT: v_mov_b32_e32 v22, v6 -; MOVREL-NEXT: v_mov_b32_e32 v21, v5 -; MOVREL-NEXT: v_mov_b32_e32 v20, v4 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v0 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v1 +; MOVREL-NEXT: v_mov_b32_e32 v7, v23 +; MOVREL-NEXT: v_mov_b32_e32 v8, v24 +; MOVREL-NEXT: v_mov_b32_e32 v9, v25 +; MOVREL-NEXT: v_mov_b32_e32 v10, v26 +; MOVREL-NEXT: v_mov_b32_e32 v11, v27 +; MOVREL-NEXT: v_mov_b32_e32 v12, v28 +; MOVREL-NEXT: v_mov_b32_e32 v13, v29 +; MOVREL-NEXT: v_mov_b32_e32 v14, v30 +; MOVREL-NEXT: v_mov_b32_e32 v15, v31 +; MOVREL-NEXT: v_mov_b32_e32 v16, v32 +; MOVREL-NEXT: v_mov_b32_e32 v17, v33 +; MOVREL-NEXT: v_mov_b32_e32 v18, v34 +; MOVREL-NEXT: v_movreld_b32_e32 v3, v0 +; MOVREL-NEXT: v_movreld_b32_e32 v4, v1 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 
exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB17_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1352,10 +1332,9 @@ ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s4, v16 +; GPRIDX-NEXT: s_lshl_b32 s5, s4, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 -; GPRIDX-NEXT: s_lshl_b32 s4, s4, 1 -; GPRIDX-NEXT: s_add_u32 s5, s4, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v32, v15 ; GPRIDX-NEXT: v_mov_b32_e32 v31, v14 ; GPRIDX-NEXT: v_mov_b32_e32 v30, v13 @@ -1375,7 +1354,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v17, s2 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v17, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -1398,13 +1377,11 @@ ; MOVREL-NEXT: v_mov_b32_e32 v17, v0 ; MOVREL-NEXT: v_mov_b32_e32 v31, v14 ; MOVREL-NEXT: v_mov_b32_e32 v30, v13 -; MOVREL-NEXT: s_lshl_b32 s4, s1, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v16 ; MOVREL-NEXT: v_mov_b32_e32 v29, v12 ; MOVREL-NEXT: v_mov_b32_e32 v28, v11 ; MOVREL-NEXT: v_mov_b32_e32 v27, v10 -; MOVREL-NEXT: s_add_u32 s1, s4, 1 -; MOVREL-NEXT: s_mov_b32 m0, s4 ; 
MOVREL-NEXT: v_mov_b32_e32 v26, v9 ; MOVREL-NEXT: v_mov_b32_e32 v25, v8 ; MOVREL-NEXT: v_mov_b32_e32 v24, v7 @@ -1415,8 +1392,7 @@ ; MOVREL-NEXT: v_mov_b32_e32 v19, v2 ; MOVREL-NEXT: v_mov_b32_e32 v18, v1 ; MOVREL-NEXT: v_movreld_b32_e32 v17, s2 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v17, s3 +; MOVREL-NEXT: v_movreld_b32_e32 v18, s3 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB18_1 @@ -1444,12 +1420,11 @@ ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_s: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1 -; GPRIDX-NEXT: s_add_u32 s1, s0, 1 ; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v0, v16 ; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v17 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v1, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off @@ -1459,13 +1434,10 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_lshl_b32 s0, s2, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: s_add_u32 s0, s0, 1 ; MOVREL-NEXT: v_movreld_b32_e32 v0, v16 -; MOVREL-NEXT: s_mov_b32 m0, s0 -; MOVREL-NEXT: v_movreld_b32_e32 v0, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v1, v17 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off @@ -1490,10 +1462,9 @@ ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18 +; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 -; GPRIDX-NEXT: s_lshl_b32 s2, 
s2, 1 -; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 ; GPRIDX-NEXT: v_mov_b32_e32 v33, v14 ; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 @@ -1513,7 +1484,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v19, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -1536,13 +1507,11 @@ ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: v_mov_b32_e32 v33, v14 ; MOVREL-NEXT: v_mov_b32_e32 v32, v13 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v31, v12 ; MOVREL-NEXT: v_mov_b32_e32 v30, v11 ; MOVREL-NEXT: v_mov_b32_e32 v29, v10 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 ; MOVREL-NEXT: v_mov_b32_e32 v28, v9 ; MOVREL-NEXT: v_mov_b32_e32 v27, v8 ; MOVREL-NEXT: v_mov_b32_e32 v26, v7 @@ -1553,8 +1522,7 @@ ; MOVREL-NEXT: v_mov_b32_e32 v21, v2 ; MOVREL-NEXT: v_mov_b32_e32 v20, v1 ; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v20, v17 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB20_1 @@ -1783,9 +1751,9 @@ ; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_add_u32 m0, s11, 1 +; GPRIDX-NEXT: s_mov_b32 m0, s11 ; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: s_movreld_b32 s1, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -1798,16 +1766,16 @@ ; ; MOVREL-LABEL: 
dyn_insertelement_v8f32_s_s_s_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_add_u32 m0, s11, 1 ; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 m0, s11 +; MOVREL-NEXT: s_mov_b32 s0, s2 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: s_movreld_b32 s1, s10 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 @@ -1835,9 +1803,9 @@ ; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_add_u32 m0, s11, 7 +; GPRIDX-NEXT: s_mov_b32 m0, s11 ; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: s_movreld_b32 s7, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -1850,16 +1818,16 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_7: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_add_u32 m0, s11, 7 ; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: s_mov_b32 m0, s11 +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_movreld_b32 s7, s10 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 @@ -1879,68 +1847,66 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: 
v_add_u32_e32 v17, 1, v9 ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 ; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v8 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc ; GPRIDX-NEXT: s_cbranch_execnz BB29_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry -; 
MOVREL-NEXT: v_add_nc_u32_e32 v17, 1, v9 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v17 -; MOVREL-NEXT: v_mov_b32_e32 v16, v7 -; MOVREL-NEXT: v_mov_b32_e32 v9, v0 -; MOVREL-NEXT: v_mov_b32_e32 v15, v6 -; MOVREL-NEXT: v_mov_b32_e32 v14, v5 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v17, v7 +; MOVREL-NEXT: v_mov_b32_e32 v11, v1 +; MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 ; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v13, v4 -; MOVREL-NEXT: v_mov_b32_e32 v12, v3 -; MOVREL-NEXT: v_mov_b32_e32 v11, v2 -; MOVREL-NEXT: v_mov_b32_e32 v10, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v9, v8 +; MOVREL-NEXT: v_mov_b32_e32 v14, v4 +; MOVREL-NEXT: v_mov_b32_e32 v13, v3 +; MOVREL-NEXT: v_mov_b32_e32 v12, v2 +; MOVREL-NEXT: v_mov_b32_e32 v10, v0 +; MOVREL-NEXT: v_movreld_b32_e32 v11, v8 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB29_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, v9 -; MOVREL-NEXT: v_mov_b32_e32 v1, v10 -; MOVREL-NEXT: v_mov_b32_e32 v2, v11 -; MOVREL-NEXT: v_mov_b32_e32 v3, v12 -; MOVREL-NEXT: v_mov_b32_e32 v4, v13 -; MOVREL-NEXT: v_mov_b32_e32 v5, v14 -; MOVREL-NEXT: v_mov_b32_e32 v6, v15 -; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: v_mov_b32_e32 v0, v10 +; MOVREL-NEXT: v_mov_b32_e32 v1, v11 +; MOVREL-NEXT: v_mov_b32_e32 v2, v12 +; MOVREL-NEXT: v_mov_b32_e32 v3, v13 +; MOVREL-NEXT: v_mov_b32_e32 v4, v14 +; MOVREL-NEXT: v_mov_b32_e32 v5, v15 +; MOVREL-NEXT: v_mov_b32_e32 v6, v16 +; MOVREL-NEXT: v_mov_b32_e32 v7, v17 ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 1 @@ -1951,68 +1917,66 @@ define 
amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: v_add_u32_e32 v17, 7, v9 ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 ; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v9, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v8 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc ; GPRIDX-NEXT: s_cbranch_execnz BB30_1 ; GPRIDX-NEXT: ; %bb.2: ; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 +; GPRIDX-NEXT: 
v_mov_b32_e32 v6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: v_add_nc_u32_e32 v17, 7, v9 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v17 -; MOVREL-NEXT: v_mov_b32_e32 v16, v7 -; MOVREL-NEXT: v_mov_b32_e32 v9, v0 -; MOVREL-NEXT: v_mov_b32_e32 v15, v6 -; MOVREL-NEXT: v_mov_b32_e32 v14, v5 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v17, v7 +; MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; MOVREL-NEXT: v_mov_b32_e32 v14, v4 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 ; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v13, v4 -; MOVREL-NEXT: v_mov_b32_e32 v12, v3 -; MOVREL-NEXT: v_mov_b32_e32 v11, v2 -; MOVREL-NEXT: v_mov_b32_e32 v10, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v9, v8 +; MOVREL-NEXT: v_mov_b32_e32 v13, v3 +; MOVREL-NEXT: v_mov_b32_e32 v12, v2 +; MOVREL-NEXT: v_mov_b32_e32 v11, v1 +; MOVREL-NEXT: v_mov_b32_e32 v10, v0 +; MOVREL-NEXT: v_movreld_b32_e32 v17, v8 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB30_1 ; MOVREL-NEXT: ; %bb.2: ; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, v9 -; MOVREL-NEXT: v_mov_b32_e32 v1, v10 -; MOVREL-NEXT: v_mov_b32_e32 v2, v11 -; MOVREL-NEXT: v_mov_b32_e32 v3, v12 -; MOVREL-NEXT: v_mov_b32_e32 v4, v13 -; MOVREL-NEXT: v_mov_b32_e32 v5, v14 -; MOVREL-NEXT: v_mov_b32_e32 v6, v15 -; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: v_mov_b32_e32 v0, v10 +; MOVREL-NEXT: v_mov_b32_e32 v1, v11 +; MOVREL-NEXT: v_mov_b32_e32 v2, v12 +; MOVREL-NEXT: v_mov_b32_e32 v3, v13 +; MOVREL-NEXT: v_mov_b32_e32 v4, v14 +; MOVREL-NEXT: v_mov_b32_e32 v5, 
v15 +; MOVREL-NEXT: v_mov_b32_e32 v6, v16 +; MOVREL-NEXT: v_mov_b32_e32 v7, v17 ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 7 @@ -2039,9 +2003,9 @@ ; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 -; GPRIDX-NEXT: s_add_u32 m0, s20, 1 +; GPRIDX-NEXT: s_mov_b32 m0, s20 ; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[18:19] +; GPRIDX-NEXT: s_movreld_b64 s[2:3], s[18:19] ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -2071,9 +2035,9 @@ ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 ; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_add_u32 m0, s20, 1 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 m0, s20 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 @@ -2086,7 +2050,7 @@ ; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: s_mov_b32 s15, s17 -; MOVREL-NEXT: s_movreld_b64 s[0:1], s[18:19] +; MOVREL-NEXT: s_movreld_b64 s[2:3], s[18:19] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: v_mov_b32_e32 v4, s4 ; MOVREL-NEXT: v_mov_b32_e32 v8, s8 @@ -2126,14 +2090,13 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: v_add_u32_e32 v18, 1, v18 ; GPRIDX-NEXT: s_mov_b64 s[0:1], exec ; GPRIDX-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 -; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 ; GPRIDX-NEXT: s_add_u32 s3, s2, 1 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: s_lshl_b32 s3, s3, 1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) ; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 ; GPRIDX-NEXT: 
v_mov_b32_e32 v33, v14 ; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 @@ -2153,7 +2116,7 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v19, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v17 ; GPRIDX-NEXT: s_set_gpr_idx_off ; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc ; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc @@ -2168,7 +2131,6 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 ; MOVREL-NEXT: s_mov_b32 s0, exec_lo ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 @@ -2177,13 +2139,12 @@ ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: v_mov_b32_e32 v33, v14 ; MOVREL-NEXT: v_mov_b32_e32 v32, v13 -; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: s_add_u32 s2, s1, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v31, v12 ; MOVREL-NEXT: v_mov_b32_e32 v30, v11 ; MOVREL-NEXT: v_mov_b32_e32 v29, v10 -; MOVREL-NEXT: s_add_u32 s1, s2, 1 -; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 ; MOVREL-NEXT: v_mov_b32_e32 v28, v9 ; MOVREL-NEXT: v_mov_b32_e32 v27, v8 ; MOVREL-NEXT: v_mov_b32_e32 v26, v7 @@ -2194,8 +2155,7 @@ ; MOVREL-NEXT: v_mov_b32_e32 v21, v2 ; MOVREL-NEXT: v_mov_b32_e32 v20, v1 ; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v20, v17 ; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo ; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo ; MOVREL-NEXT: s_cbranch_execnz BB32_1 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir @@ -500,19 +500,15 @@ ; MOVREL: 
[[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; MOVREL: $m0 = COPY [[S_ADD_U32_]] - ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0, implicit $exec ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]] ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_1 ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 8, implicit-def $m0, implicit $m0 - ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0 + ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0, implicit $exec ; GPRIDX: S_SET_GPR_IDX_OFF ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]] %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -574,19 +570,15 @@ ; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - 
; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; MOVREL: $m0 = COPY [[S_ADD_U32_]] - ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0 ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]] ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v8s32_add_1 ; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 - ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc - ; GPRIDX: $m0 = COPY [[S_ADD_U32_]] - ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 2, implicit $m0 ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]] %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:sgpr(s32) = COPY $sgpr8