diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -185,6 +185,9 @@
   bool foldExtractEltToCmpSelect(MachineInstr &MI,
                                  MachineRegisterInfo &MRI,
                                  const OperandsMapper &OpdMapper) const;
+  bool foldInsertEltToCmpSelect(MachineInstr &MI,
+                                MachineRegisterInfo &MRI,
+                                const OperandsMapper &OpdMapper) const;
 };
 } // End llvm namespace.
 #endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -1938,6 +1938,94 @@
   return true;
 }
 
+bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
+    MachineInstr &MI, MachineRegisterInfo &MRI,
+    const OperandsMapper &OpdMapper) const {
+
+  Register VecReg = MI.getOperand(1).getReg();
+  Register Idx = MI.getOperand(3).getReg();
+
+  const RegisterBank &IdxBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
+  bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+  LLT VecTy = MRI.getType(VecReg);
+  unsigned EltSize = VecTy.getScalarSizeInBits();
+  unsigned NumElem = VecTy.getNumElements();
+
+  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+                                                  IsDivergentIdx))
+    return false;
+
+  MachineIRBuilder B(MI);
+  LLT S32 = LLT::scalar(32);
+
+  const RegisterBank &DstBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+  const RegisterBank &SrcBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+  const RegisterBank &InsBank =
+    *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+  const RegisterBank &CCBank =
+    (DstBank == AMDGPU::SGPRRegBank &&
+     SrcBank == AMDGPU::SGPRRegBank &&
+     InsBank == AMDGPU::SGPRRegBank &&
+     IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+                                     : AMDGPU::VCCRegBank;
+  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+    Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+    MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+  }
+
+  LLT EltTy = VecTy.getScalarType();
+  SmallVector<Register, 16> InsRegs(OpdMapper.getVRegs(2));
+  unsigned NumLanes = InsRegs.size();
+  if (!NumLanes) {
+    NumLanes = 1;
+    InsRegs.push_back(MI.getOperand(2).getReg());
+  } else {
+    EltTy = MRI.getType(InsRegs[0]);
+  }
+
+  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+  SmallVector<Register, 16> Ops(NumElem * NumLanes);
+
+  for (unsigned I = 0; I < NumElem; ++I) {
+    auto IC = B.buildConstant(S32, I);
+    MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+    auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+    MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+    for (unsigned L = 0; L < NumLanes; ++L) {
+      auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
+                             UnmergeToEltTy.getReg(I * NumLanes + L));
+
+      for (unsigned N : { 0, 2, 3 })
+        MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+      Ops[I * NumLanes + L] = S->getOperand(0).getReg();
+    }
+  }
+
+  LLT MergeTy = LLT::vector(Ops.size(), EltTy);
+  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
+    B.buildBuildVector(MI.getOperand(0), Ops);
+  } else {
+    auto Vec = B.buildBuildVector(MergeTy, Ops);
+    MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
+    B.buildBitcast(MI.getOperand(0).getReg(), Vec);
+  }
+
+  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+  MI.eraseFromParent();
+
+  return true;
+}
+
 void AMDGPURegisterBankInfo::applyMappingImpl(
     const OperandsMapper &OpdMapper) const {
   MachineInstr &MI = OpdMapper.getMI();
@@ -2665,12 +2753,15 @@
     assert(OpdMapper.getVRegs(0).empty());
     assert(OpdMapper.getVRegs(3).empty());
 
-    const RegisterBank *IdxBank =
-      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
-
     if (substituteSimpleCopyRegs(OpdMapper, 1))
       MRI.setType(MI.getOperand(1).getReg(), VecTy);
 
+    if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
+      return;
+
+    const RegisterBank *IdxBank =
+      OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
     Register SrcReg = MI.getOperand(1).getReg();
     Register InsReg = MI.getOperand(2).getReg();
     LLT InsTy = MRI.getType(InsReg);
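[Illustrative note, not part of the patch: foldInsertEltToCmpSelect rewrites a G_INSERT_VECTOR_ELT with a non-constant index into one G_ICMP/G_SELECT pair per element (and per 32-bit lane for 64-bit elements), then rebuilds the vector with G_BUILD_VECTOR. A hypothetical generic-MIR sketch of what it builds for a <2 x s32> insert with a divergent index — register names and bank annotations here are invented for illustration:

  %c0:sgpr(s32) = G_CONSTANT i32 0
  %cmp0:vcc(s1) = G_ICMP intpred(eq), %idx(s32), %c0
  %e0:vgpr(s32), %e1:vgpr(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
  %s0:vgpr(s32) = G_SELECT %cmp0(s1), %val(s32), %e0
  %c1:sgpr(s32) = G_CONSTANT i32 1
  %cmp1:vcc(s1) = G_ICMP intpred(eq), %idx(s32), %c1
  %s1:vgpr(s32) = G_SELECT %cmp1(s1), %val(s32), %e1
  %dst:vgpr(<2 x s32>) = G_BUILD_VECTOR %s0(s32), %s1(s32)

When every operand maps to the SGPR bank, the same chain is built with an s32 condition on the SGPR bank instead of vcc, which is what lets the *_s_s_s tests below select to scalar s_cmp_eq_u32/s_cselect_b32 sequences.]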
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -9,32 +9,43 @@
 define amdgpu_ps <8 x i32> @dyn_insertelement_v8i32_s_s_s(<8 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8i32_s_s_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 m0, s11
-; GPRIDX-NEXT: s_nop 0
-; GPRIDX-NEXT: s_movreld_b32 s0, s10
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s10, s2
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 1
+; GPRIDX-NEXT: s_cselect_b32 s1, s10, s3
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 2
+; GPRIDX-NEXT: s_cselect_b32 s2, s10, s4
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 3
+; GPRIDX-NEXT: s_cselect_b32 s3, s10, s5
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 4
+; GPRIDX-NEXT: s_cselect_b32 s4, s10, s6
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 5
+; GPRIDX-NEXT: s_cselect_b32 s5, s10, s7
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 6
+; GPRIDX-NEXT: s_cselect_b32 s6, s10, s8
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 7
+; GPRIDX-NEXT: s_cselect_b32 s7, s10, s9
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8i32_s_s_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 m0, s11
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_movreld_b32 s0, s10
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: s_cselect_b32 s0, s10, s2
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 1
+; MOVREL-NEXT: s_cselect_b32 s1, s10, s3
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 2
+; MOVREL-NEXT: s_cselect_b32 s2, s10, s4
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 3
+; MOVREL-NEXT: s_cselect_b32 s3, s10, s5
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 4
+; MOVREL-NEXT: s_cselect_b32 s4, s10, s6
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 5
+; MOVREL-NEXT: s_cselect_b32 s5, s10, s7
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 6
+; MOVREL-NEXT: s_cselect_b32 s6, s10, s8
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 7
+; MOVREL-NEXT: s_cselect_b32 s7, s10, s9
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x i32> %vec, i32 %val, i32 %idx
@@ -44,32 +55,43 @@
 define amdgpu_ps <8 x i8 addrspace(3)*> @dyn_insertelement_v8p3i8_s_s_s(<8 x i8 addrspace(3)*> inreg %vec, i8 addrspace(3)* inreg %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8p3i8_s_s_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 m0, s11
-; GPRIDX-NEXT: s_nop 0
-; GPRIDX-NEXT: s_movreld_b32 s0, s10
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s10, s2
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 1
+; GPRIDX-NEXT: s_cselect_b32 s1, s10, s3
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 2
+; GPRIDX-NEXT: s_cselect_b32 s2, s10, s4
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 3
+; GPRIDX-NEXT: s_cselect_b32 s3, s10, s5
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 4
+; GPRIDX-NEXT: s_cselect_b32 s4, s10, s6
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 5
+; GPRIDX-NEXT: s_cselect_b32 s5, s10, s7
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 6
+; GPRIDX-NEXT: s_cselect_b32 s6, s10, s8
+; GPRIDX-NEXT: s_cmp_eq_u32 s11, 7
+; GPRIDX-NEXT: s_cselect_b32 s7, s10, s9
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8p3i8_s_s_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 m0, s11
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_movreld_b32 s0, s10
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: s_cselect_b32 s0, s10, s2
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 1
+; MOVREL-NEXT: s_cselect_b32 s1, s10, s3
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 2
+; MOVREL-NEXT: s_cselect_b32 s2, s10, s4
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 3
+; MOVREL-NEXT: s_cselect_b32 s3, s10, s5
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 4
+; MOVREL-NEXT: s_cselect_b32 s4, s10, s6
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 5
+; MOVREL-NEXT: s_cselect_b32 s5, s10, s7
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 6
+; MOVREL-NEXT: s_cselect_b32 s6, s10, s8
+; MOVREL-NEXT: s_cmp_eq_u32 s11, 7
+; MOVREL-NEXT: s_cselect_b32 s7, s10, s9
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 %idx
@@ -88,42 +110,32 @@
 ; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000
 ; GPRIDX-NEXT: s_mov_b32 s5, 2.0
 ; GPRIDX-NEXT: s_mov_b32 s4, 1.0
-; GPRIDX-NEXT: v_mov_b32_e32 v17, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v16, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v12, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v11, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v10, s4
-; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
-; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v16
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v17
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB2_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v2
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v3
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v4
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v5
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v6
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v7
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v8
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v9
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s4
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s5
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s6
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s7
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s8
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s9
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s10
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v9
 ; GPRIDX-NEXT: s_setpc_b64 s[30:31]
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_const_s_v_v:
@@ -131,49 +143,40 @@
 ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
 ; MOVREL-NEXT: s_mov_b32 s11, 0x41000000
+; MOVREL-NEXT: s_mov_b32 s4, 1.0
 ; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000
 ; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000
 ; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000
 ; MOVREL-NEXT: s_mov_b32 s7, 4.0
 ; MOVREL-NEXT: s_mov_b32 s6, 0x40400000
 ; MOVREL-NEXT: s_mov_b32 s5, 2.0
-; MOVREL-NEXT: s_mov_b32 s4, 1.0
-; MOVREL-NEXT: v_mov_b32_e32 v17, s11
-; MOVREL-NEXT: v_mov_b32_e32 v16, s10
-; MOVREL-NEXT: v_mov_b32_e32 v15, s9
-; MOVREL-NEXT: v_mov_b32_e32 v14, s8
-; MOVREL-NEXT: v_mov_b32_e32 v13, s7
-; MOVREL-NEXT: v_mov_b32_e32 v12, s6
-; MOVREL-NEXT: v_mov_b32_e32 v11, s5
-; MOVREL-NEXT: v_mov_b32_e32 v10, s4
-; MOVREL-NEXT: s_mov_b32 s4, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v15, s11
+; MOVREL-NEXT: v_mov_b32_e32 v8, s4
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; MOVREL-NEXT: v_mov_b32_e32 v9, s5
+; MOVREL-NEXT: v_mov_b32_e32 v10, s6
+; MOVREL-NEXT: v_mov_b32_e32 v11, s7
+; MOVREL-NEXT: v_mov_b32_e32 v12, s8
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; MOVREL-NEXT: v_mov_b32_e32 v13, s9
+; MOVREL-NEXT: v_mov_b32_e32 v14, s10
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s5, v1
-; MOVREL-NEXT: v_mov_b32_e32 v2, v10
-; MOVREL-NEXT: v_mov_b32_e32 v3, v11
-; MOVREL-NEXT: v_mov_b32_e32 v4, v12
-; MOVREL-NEXT: v_mov_b32_e32 v5, v13
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v1
-; MOVREL-NEXT: s_mov_b32 m0, s5
-; MOVREL-NEXT: v_mov_b32_e32 v6, v14
-; MOVREL-NEXT: v_mov_b32_e32 v7, v15
-; MOVREL-NEXT: v_mov_b32_e32 v8, v16
-; MOVREL-NEXT: v_mov_b32_e32 v9, v17
-; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB2_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s4
-; MOVREL-NEXT: v_mov_b32_e32 v0, v2
-; MOVREL-NEXT: v_mov_b32_e32 v1, v3
-; MOVREL-NEXT: v_mov_b32_e32 v2, v4
-; MOVREL-NEXT: v_mov_b32_e32 v3, v5
-; MOVREL-NEXT: v_mov_b32_e32 v4, v6
-; MOVREL-NEXT: v_mov_b32_e32 v5, v7
-; MOVREL-NEXT: v_mov_b32_e32 v6, v8
-; MOVREL-NEXT: v_mov_b32_e32 v7, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; MOVREL-NEXT: v_mov_b32_e32 v1, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v0, v8
 ; MOVREL-NEXT: s_setpc_b64 s[30:31]
 entry:
   %insert = insertelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, float %val, i32 %idx
@@ -191,35 +194,32 @@
 ; GPRIDX-NEXT: s_mov_b32 s2, s4
 ; GPRIDX-NEXT: s_mov_b32 s4, s6
 ; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v16, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v12, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v11, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v10, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v9, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v8
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v8
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v16
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s10
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB3_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_v:
@@ -232,42 +232,33 @@
 ; MOVREL-NEXT: s_mov_b32 s2, s4
 ; MOVREL-NEXT: s_mov_b32 s4, s6
 ; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v16, s7
-; MOVREL-NEXT: v_mov_b32_e32 v15, s6
-; MOVREL-NEXT: v_mov_b32_e32 v14, s5
-; MOVREL-NEXT: v_mov_b32_e32 v13, s4
-; MOVREL-NEXT: v_mov_b32_e32 v12, s3
-; MOVREL-NEXT: v_mov_b32_e32 v11, s2
-; MOVREL-NEXT: v_mov_b32_e32 v10, s1
-; MOVREL-NEXT: v_mov_b32_e32 v9, s0
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v15, s7
+; MOVREL-NEXT: v_mov_b32_e32 v7, s10
+; MOVREL-NEXT: v_mov_b32_e32 v8, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; MOVREL-NEXT: v_mov_b32_e32 v9, s1
+; MOVREL-NEXT: v_mov_b32_e32 v10, s2
+; MOVREL-NEXT: v_mov_b32_e32 v11, s3
+; MOVREL-NEXT: v_mov_b32_e32 v12, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; MOVREL-NEXT: v_mov_b32_e32 v13, s5
+; MOVREL-NEXT: v_mov_b32_e32 v14, s6
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v0
-; MOVREL-NEXT: v_mov_b32_e32 v1, v9
-; MOVREL-NEXT: v_mov_b32_e32 v2, v10
-; MOVREL-NEXT: v_mov_b32_e32 v3, v11
-; MOVREL-NEXT: v_mov_b32_e32 v4, v12
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
-; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v5, v13
-; MOVREL-NEXT: v_mov_b32_e32 v6, v14
-; MOVREL-NEXT: v_mov_b32_e32 v7, v15
-; MOVREL-NEXT: v_mov_b32_e32 v8, v16
-; MOVREL-NEXT: v_movreld_b32_e32 v1, s10
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB3_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, v1
-; MOVREL-NEXT: v_mov_b32_e32 v1, v2
-; MOVREL-NEXT: v_mov_b32_e32 v2, v3
-; MOVREL-NEXT: v_mov_b32_e32 v3, v4
-; MOVREL-NEXT: v_mov_b32_e32 v4, v5
-; MOVREL-NEXT: v_mov_b32_e32 v5, v6
-; MOVREL-NEXT: v_mov_b32_e32 v6, v7
-; MOVREL-NEXT: v_mov_b32_e32 v7, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
+; MOVREL-NEXT: v_mov_b32_e32 v0, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -277,50 +268,77 @@
 define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %vec, float %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
 ; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
 ; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
 ; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
 ; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
-; GPRIDX-NEXT: s_set_gpr_idx_on s10, gpr_idx(DST)
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 s6, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc
 ; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_s_v_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
 ; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
 ; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
 ; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
 ; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: v_mov_b32_e32 v8, v0
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: v_mov_b32_e32 v1, s1
-; MOVREL-NEXT: v_mov_b32_e32 v2, s2
-; MOVREL-NEXT: v_mov_b32_e32 v3, s3
-; MOVREL-NEXT: v_mov_b32_e32 v4, s4
-; MOVREL-NEXT: v_mov_b32_e32 v5, s5
-; MOVREL-NEXT: v_mov_b32_e32 v6, s6
-; MOVREL-NEXT: v_mov_b32_e32 v7, s7
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT: s_mov_b32 s0, s2
+; MOVREL-NEXT: s_mov_b32 s2, s4
+; MOVREL-NEXT: s_mov_b32 s4, s6
+; MOVREL-NEXT: s_mov_b32 s6, s8
+; MOVREL-NEXT: v_mov_b32_e32 v15, s7
+; MOVREL-NEXT: v_mov_b32_e32 v8, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 0
+; MOVREL-NEXT: v_mov_b32_e32 v9, s1
+; MOVREL-NEXT: v_mov_b32_e32 v10, s2
+; MOVREL-NEXT: v_mov_b32_e32 v11, s3
+; MOVREL-NEXT: v_mov_b32_e32 v12, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 1
+; MOVREL-NEXT: v_mov_b32_e32 v13, s5
+; MOVREL-NEXT: v_mov_b32_e32 v14, s6
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 2
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 4
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 5
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 6
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 7
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v0, v8
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -330,16 +348,45 @@
 define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_s(<8 x float> %vec, float inreg %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_s_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s2
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s3, 7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_v_s_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 m0, s3
+; MOVREL-NEXT: v_mov_b32_e32 v8, s2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: v_movreld_b32_e32 v0, s2
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 2
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 4
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 5
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 6
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 7
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -357,36 +404,32 @@
 ; GPRIDX-NEXT: s_mov_b32 s2, s4
 ; GPRIDX-NEXT: s_mov_b32 s4, s6
 ; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v17, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v16, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v12, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v11, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v10, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v16
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v17
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc
 ; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB6_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v9
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_s_v_v:
@@ -399,42 +442,33 @@
 ; MOVREL-NEXT: s_mov_b32 s2, s4
 ; MOVREL-NEXT: s_mov_b32 s4, s6
 ; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v17, s7
-; MOVREL-NEXT: v_mov_b32_e32 v16, s6
-; MOVREL-NEXT: v_mov_b32_e32 v15, s5
-; MOVREL-NEXT: v_mov_b32_e32 v14, s4
-; MOVREL-NEXT: v_mov_b32_e32 v13, s3
-; MOVREL-NEXT: v_mov_b32_e32 v12, s2
-; MOVREL-NEXT: v_mov_b32_e32 v11, s1
-; MOVREL-NEXT: v_mov_b32_e32 v10, s0
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v15, s7
+; MOVREL-NEXT: v_mov_b32_e32 v8, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; MOVREL-NEXT: v_mov_b32_e32 v9, s1
+; MOVREL-NEXT: v_mov_b32_e32 v10, s2
+; MOVREL-NEXT: v_mov_b32_e32 v11, s3
+; MOVREL-NEXT: v_mov_b32_e32 v12, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; MOVREL-NEXT: v_mov_b32_e32 v13, s5
+; MOVREL-NEXT: v_mov_b32_e32 v14, s6
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT: v_mov_b32_e32 v2, v10
-; MOVREL-NEXT: v_mov_b32_e32 v3, v11
-; MOVREL-NEXT: v_mov_b32_e32 v4, v12
-; MOVREL-NEXT: v_mov_b32_e32 v5, v13
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
-; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v6, v14
-; MOVREL-NEXT: v_mov_b32_e32 v7, v15
-; MOVREL-NEXT: v_mov_b32_e32 v8, v16
-; MOVREL-NEXT: v_mov_b32_e32 v9, v17
-; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB6_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, v2
-; MOVREL-NEXT: v_mov_b32_e32 v1, v3
-; MOVREL-NEXT: v_mov_b32_e32 v2, v4
-; MOVREL-NEXT: v_mov_b32_e32 v3, v5
-; MOVREL-NEXT: v_mov_b32_e32 v4, v6
-; MOVREL-NEXT: v_mov_b32_e32 v5, v7
-; MOVREL-NEXT: v_mov_b32_e32 v6, v8
-; MOVREL-NEXT: v_mov_b32_e32 v7, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v1
+; MOVREL-NEXT: v_mov_b32_e32 v1, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v0, v8
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -444,66 +478,45 @@
 define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_v(<8 x float> %vec, float inreg %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_s_v:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s3, v8
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s3, v8
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v7
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v6
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v5
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v4
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v3
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v2
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s2
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB7_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_v_s_v:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v9, s2
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v8
-; MOVREL-NEXT: v_mov_b32_e32 v16, v7
-; MOVREL-NEXT: v_mov_b32_e32 v9, v0
-; MOVREL-NEXT: v_mov_b32_e32 v15, v6
-; MOVREL-NEXT: v_mov_b32_e32 v14, v5
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8
-; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v13, v4
-; MOVREL-NEXT: v_mov_b32_e32 v12, v3
-; MOVREL-NEXT: v_mov_b32_e32 v11, v2
-; MOVREL-NEXT: v_mov_b32_e32 v10, v1
-; MOVREL-NEXT: v_movreld_b32_e32 v9, s2
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB7_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, v9
-; MOVREL-NEXT: v_mov_b32_e32 v1, v10
-; MOVREL-NEXT: v_mov_b32_e32 v2, v11
-; MOVREL-NEXT: v_mov_b32_e32 v3, v12
-; MOVREL-NEXT: v_mov_b32_e32 v4, v13
-; MOVREL-NEXT: v_mov_b32_e32 v5, v14
-; MOVREL-NEXT: v_mov_b32_e32 v6, v15
-; MOVREL-NEXT: v_mov_b32_e32 v7, v16
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -513,16 +526,43 @@
 define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_s(<8 x float> %vec, float %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 m0, s2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 7
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -532,16 +572,43 @@
 define amdgpu_ps <8 x float> @dyn_insertelement_v8p3i8_v_v_s(<8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8p3i8_v_v_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8p3i8_v_v_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 m0, s2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 7
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 %idx
@@ -553,66 +620,43 @@
 define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v(<8 x float> %vec, float %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB10_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v7
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v6
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v5
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v4
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v3
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v2
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB10_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v16
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v17
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v9
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB10_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v9
-; MOVREL-NEXT: v_mov_b32_e32 v17, v7
-; MOVREL-NEXT: v_mov_b32_e32 v10, v0
-; MOVREL-NEXT: v_mov_b32_e32 v16, v6
-; MOVREL-NEXT: v_mov_b32_e32 v15, v5
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9
-; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v14, v4
-; MOVREL-NEXT: v_mov_b32_e32 v13, v3
-; MOVREL-NEXT: v_mov_b32_e32 v12, v2
-; MOVREL-NEXT: v_mov_b32_e32 v11, v1
-; MOVREL-NEXT: v_movreld_b32_e32 v10, v8
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB10_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, v10
-; MOVREL-NEXT: v_mov_b32_e32 v1, v11
-; MOVREL-NEXT: v_mov_b32_e32 v2, v12
-; MOVREL-NEXT: v_mov_b32_e32 v3, v13
-; MOVREL-NEXT: v_mov_b32_e32 v4, v14
-; MOVREL-NEXT: v_mov_b32_e32 v5, v15
-; MOVREL-NEXT: v_mov_b32_e32 v6, v16
-; MOVREL-NEXT: v_mov_b32_e32 v7, v17
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <8 x float> %vec, float %val, i32 %idx
@@ -726,6 +770,7 @@
 ; GPRIDX: ; %bb.0: ; %entry
 ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GPRIDX-NEXT: s_mov_b32 s18, 0
+; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0
 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000
 ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000
 ; GPRIDX-NEXT: s_mov_b32 s16, s18
@@ -737,55 +782,46 @@
 ; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000
 ; GPRIDX-NEXT: s_mov_b32 s8, s18
 ; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0
-; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0
-; GPRIDX-NEXT: v_mov_b32_e32 v34, s19
-; GPRIDX-NEXT: v_mov_b32_e32 v33, s18
-; GPRIDX-NEXT: v_mov_b32_e32 v32, s17
-; GPRIDX-NEXT: v_mov_b32_e32 v31, s16
-; GPRIDX-NEXT: v_mov_b32_e32 v30, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v29, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v28, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v27, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v26, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v25, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v24, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v23, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v22, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v21, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v20, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v19, s4
-; GPRIDX-NEXT: s_mov_b64 s[4:5], exec
-; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v2
-; GPRIDX-NEXT: s_lshl_b32 s7, s6, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2
-; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v22
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v23
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v24
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v25
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v26
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v27
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v28
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v29
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v30
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v31
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v32
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v33
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v34
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v1
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB13_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[4:5]
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s6
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s14
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s15
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s16
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s17
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s18
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s19
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v2
+; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[16:17]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[16:17]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[12:13]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[14:15]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[12:13]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[14:15]
 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
@@ -798,6 +834,7 @@
 ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
 ; MOVREL-NEXT: s_mov_b32 s18, 0
+; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0
 ; MOVREL-NEXT: s_mov_b32 s19, 0x40200000
 ; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000
 ; MOVREL-NEXT: s_mov_b32 s16, s18
@@ -809,52 +846,47 @@
 ; MOVREL-NEXT: s_mov_b32 s9, 0x40080000
 ; MOVREL-NEXT: s_mov_b32 s8, s18
 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0
-; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0
-; MOVREL-NEXT: v_mov_b32_e32 v34, s19
-; MOVREL-NEXT: v_mov_b32_e32 v33, s18
-; MOVREL-NEXT: v_mov_b32_e32 v32, s17
-; MOVREL-NEXT: v_mov_b32_e32 v31, s16
-; MOVREL-NEXT: v_mov_b32_e32 v30, s15
-; MOVREL-NEXT: v_mov_b32_e32 v29, s14
-; MOVREL-NEXT: v_mov_b32_e32 v28, s13
-; MOVREL-NEXT: v_mov_b32_e32 v27, s12
-; MOVREL-NEXT: v_mov_b32_e32 v26, s11
-; MOVREL-NEXT: v_mov_b32_e32 v25, s10
-; MOVREL-NEXT: v_mov_b32_e32 v24, s9
-; MOVREL-NEXT: v_mov_b32_e32 v23, s8
-; MOVREL-NEXT: v_mov_b32_e32 v22, s7
-; MOVREL-NEXT: v_mov_b32_e32 v21, s6
-; MOVREL-NEXT: v_mov_b32_e32 v20, s5
-; MOVREL-NEXT: v_mov_b32_e32 v19, s4
-; MOVREL-NEXT: s_mov_b32 s4, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v3, s4
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; MOVREL-NEXT: v_mov_b32_e32 v4, s5
+; MOVREL-NEXT: v_mov_b32_e32 v5, s6
+; MOVREL-NEXT: v_mov_b32_e32 v6, s7
+; MOVREL-NEXT: v_mov_b32_e32 v7, s8
+; MOVREL-NEXT: v_mov_b32_e32 v8, s9
+; MOVREL-NEXT: v_mov_b32_e32 v9, s10
+; MOVREL-NEXT: v_mov_b32_e32 v10, s11
+; MOVREL-NEXT: v_mov_b32_e32 v11, s12
+; MOVREL-NEXT: v_mov_b32_e32 v12, s13
+; MOVREL-NEXT: v_mov_b32_e32 v13, s14
+; MOVREL-NEXT: v_mov_b32_e32 v14, s15
+; MOVREL-NEXT: v_mov_b32_e32 v15, s16
+; MOVREL-NEXT: v_mov_b32_e32 v16, s17
+; MOVREL-NEXT: v_mov_b32_e32 v17, s18
+; MOVREL-NEXT: v_mov_b32_e32 v18, s19
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 1, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 4, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s7, 5, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 3, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s8, 6, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s9, 7, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s10, 2, v2
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v0, s6
+; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v0, s7
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v1, s6
+; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, v1, s7
+; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v0, s10
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v0, s5
+; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v0, s8
+; MOVREL-NEXT: v_cndmask_b32_e64 v17, v17, v0, s9
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v1, s10
+; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v1, s5
+; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s8
+; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s9
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s5, v2
-; MOVREL-NEXT: v_mov_b32_e32 v3, v19
-; MOVREL-NEXT: v_mov_b32_e32 v4, v20
-; MOVREL-NEXT: v_mov_b32_e32 v5, v21
-; MOVREL-NEXT: v_mov_b32_e32 v6, v22
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2
-; MOVREL-NEXT: s_lshl_b32 m0, s5, 1
-; MOVREL-NEXT: v_mov_b32_e32 v7, v23
-; MOVREL-NEXT: v_mov_b32_e32 v8, v24
-; MOVREL-NEXT: v_mov_b32_e32 v9, v25
-; MOVREL-NEXT: v_mov_b32_e32 v10, v26
-; MOVREL-NEXT: v_mov_b32_e32 v11, v27
-; MOVREL-NEXT: v_mov_b32_e32 v12, v28
-; MOVREL-NEXT: v_mov_b32_e32 v13, v29
-; MOVREL-NEXT: v_mov_b32_e32 v14, v30
-; MOVREL-NEXT: v_mov_b32_e32 v15, v31
-; MOVREL-NEXT: v_mov_b32_e32 v16, v32
-; MOVREL-NEXT: v_mov_b32_e32 v17, v33
-; MOVREL-NEXT: v_mov_b32_e32 v18, v34
-; MOVREL-NEXT: v_movreld_b32_e32 v3, v0
-; MOVREL-NEXT: v_movreld_b32_e32 v4, v1
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB13_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s4
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
@@ -893,54 +925,48 @@
 ; GPRIDX-NEXT: s_mov_b32 s10, s12
 ; GPRIDX-NEXT: s_mov_b32 s12, s14
 ; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: v_mov_b32_e32 v32, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v31, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v30, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v29, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v28, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v27, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v26, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v25, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v24, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v23, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v22, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v21, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v20, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v19, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v18, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v17, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v0
-; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v17
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v18
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v22
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v23
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v24
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v25
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v26
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v27
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v28
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v29
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v30
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v31
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v32
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s18
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s19
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB14_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s15
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s14
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s0
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s18
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s19
+; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v0, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v0, s[10:11]
 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
 ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
@@ -965,53 +991,51 @@
 ; MOVREL-NEXT: s_mov_b32 s10, s12
 ; MOVREL-NEXT: s_mov_b32 s12, s14
 ; MOVREL-NEXT: s_mov_b32 s14, s16
-; MOVREL-NEXT: v_mov_b32_e32 v32, s15
-; MOVREL-NEXT: v_mov_b32_e32 v31, s14
-; MOVREL-NEXT: v_mov_b32_e32 v30, s13
-; MOVREL-NEXT: v_mov_b32_e32 v29, s12
-; MOVREL-NEXT: v_mov_b32_e32 v28, s11
-; MOVREL-NEXT: v_mov_b32_e32 v27, s10
-; MOVREL-NEXT: v_mov_b32_e32 v26, s9
-; MOVREL-NEXT: v_mov_b32_e32 v25, s8
-; MOVREL-NEXT: v_mov_b32_e32 v24, s7
-; MOVREL-NEXT: v_mov_b32_e32 v23, s6
-; MOVREL-NEXT: v_mov_b32_e32 v22, s5
-; MOVREL-NEXT: v_mov_b32_e32 v21, s4
-; MOVREL-NEXT: v_mov_b32_e32 v20, s3
-; MOVREL-NEXT: v_mov_b32_e32 v19, s2
-; MOVREL-NEXT: v_mov_b32_e32 v18, s1
-; MOVREL-NEXT: v_mov_b32_e32 v17, s0
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
-; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v0
-; MOVREL-NEXT: v_mov_b32_e32 v1, v17
-; MOVREL-NEXT: v_mov_b32_e32 v2, v18
-; MOVREL-NEXT: v_mov_b32_e32 v3, v19
-; MOVREL-NEXT: v_mov_b32_e32 v4, v20
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
-; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
-; MOVREL-NEXT: v_mov_b32_e32 v5, v21
-; MOVREL-NEXT: v_mov_b32_e32 v6, v22
-; MOVREL-NEXT: v_mov_b32_e32 v7, v23
-; MOVREL-NEXT: v_mov_b32_e32 v8, v24
-; MOVREL-NEXT: v_mov_b32_e32 v9, v25
-; MOVREL-NEXT: v_mov_b32_e32 v10, v26
-; MOVREL-NEXT: v_mov_b32_e32 v11, v27
-; MOVREL-NEXT: v_mov_b32_e32 v12, v28
-; MOVREL-NEXT: v_mov_b32_e32 v13, v29
-; MOVREL-NEXT: v_mov_b32_e32 v14, v30
-; MOVREL-NEXT: v_mov_b32_e32 v15, v31
-; MOVREL-NEXT: v_mov_b32_e32 v16, v32
-; MOVREL-NEXT: v_movreld_b32_e32 v1, s18
-; MOVREL-NEXT: v_movreld_b32_e32 v2, s19
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB14_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
+; MOVREL-NEXT: v_mov_b32_e32 v16, s15
+; MOVREL-NEXT: v_mov_b32_e32 v15, s14
+; MOVREL-NEXT: v_mov_b32_e32 v14, s13
+; MOVREL-NEXT: v_mov_b32_e32 v13, s12
+; MOVREL-NEXT: v_mov_b32_e32 v12, s11
+; MOVREL-NEXT: v_mov_b32_e32 v11, s10
+; MOVREL-NEXT: v_mov_b32_e32 v10, s9
+; MOVREL-NEXT: v_mov_b32_e32 v9, s8
+; MOVREL-NEXT: v_mov_b32_e32 v8, s7
+; MOVREL-NEXT: v_mov_b32_e32 v7, s6
+; MOVREL-NEXT: v_mov_b32_e32 v6, s5
+; MOVREL-NEXT: v_mov_b32_e32 v5, s4
+; MOVREL-NEXT: v_mov_b32_e32 v4, s3
+; MOVREL-NEXT: v_mov_b32_e32 v3, s2
+; MOVREL-NEXT: v_mov_b32_e32 v2, s1
+; MOVREL-NEXT: v_mov_b32_e32 v1, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; MOVREL-NEXT: s_mov_b32 s30, s18
+; MOVREL-NEXT: s_mov_b32 s31, s19
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0
+; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s30, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s31, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s30, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s31, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s30, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s31, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s30, s2
+; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s31, s2
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s30, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s31, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s30, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s31, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s30, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s31, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s30, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, s31, s4
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[1:4], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[5:8], off
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[9:12], off
 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[13:16], off
 ; MOVREL-NEXT: s_endpgm
@@ -1187,54 +1211,46 @@
 ; GPRIDX-NEXT: s_mov_b32 s10, s12
 ; GPRIDX-NEXT: s_mov_b32 s12, s14
 ; GPRIDX-NEXT: s_mov_b32 s14, s16
-; GPRIDX-NEXT: v_mov_b32_e32 v34, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v33, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v32, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v31, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v30, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v29, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v28, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v27, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v26, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v25, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v24, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v23, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v22, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v21, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v20, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v19, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
-; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v22
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v23
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v24
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v25
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v26
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v27
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v28
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v29
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v30
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v31
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v32
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v33
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v34
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v1
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB17_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v2
+; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[12:13]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[12:13]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v0, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v17, v17, v0, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
+;
GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v1, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v16, v16, v1, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v18, v18, v1, s[10:11] ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off @@ -1259,51 +1275,47 @@ ; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v34, s15 -; MOVREL-NEXT: v_mov_b32_e32 v33, s14 -; MOVREL-NEXT: v_mov_b32_e32 v32, s13 -; MOVREL-NEXT: v_mov_b32_e32 v31, s12 -; MOVREL-NEXT: v_mov_b32_e32 v30, s11 -; MOVREL-NEXT: v_mov_b32_e32 v29, s10 -; MOVREL-NEXT: v_mov_b32_e32 v28, s9 -; MOVREL-NEXT: v_mov_b32_e32 v27, s8 -; MOVREL-NEXT: v_mov_b32_e32 v26, s7 -; MOVREL-NEXT: v_mov_b32_e32 v25, s6 -; MOVREL-NEXT: v_mov_b32_e32 v24, s5 -; MOVREL-NEXT: v_mov_b32_e32 v23, s4 -; MOVREL-NEXT: v_mov_b32_e32 v22, s3 -; MOVREL-NEXT: v_mov_b32_e32 v21, s2 -; MOVREL-NEXT: v_mov_b32_e32 v20, s1 -; MOVREL-NEXT: v_mov_b32_e32 v19, s0 -; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: v_mov_b32_e32 v18, s15 +; MOVREL-NEXT: v_mov_b32_e32 v17, s14 +; MOVREL-NEXT: v_mov_b32_e32 v16, s13 +; MOVREL-NEXT: v_mov_b32_e32 v15, s12 +; MOVREL-NEXT: v_mov_b32_e32 v14, s11 +; MOVREL-NEXT: v_mov_b32_e32 v13, s10 +; MOVREL-NEXT: v_mov_b32_e32 v12, s9 +; MOVREL-NEXT: v_mov_b32_e32 v11, s8 +; MOVREL-NEXT: v_mov_b32_e32 v10, s7 +; MOVREL-NEXT: v_mov_b32_e32 v9, s6 +; MOVREL-NEXT: v_mov_b32_e32 v8, s5 +; MOVREL-NEXT: v_mov_b32_e32 v7, s4 +; MOVREL-NEXT: v_mov_b32_e32 v6, s3 +; MOVREL-NEXT: v_mov_b32_e32 v5, s2 +; MOVREL-NEXT: v_mov_b32_e32 v4, s1 +; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 2, v2 +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v0, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v0, s3 +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v1, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, v1, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v0, s6 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v0, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v0, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v17, v17, v0, s5 +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v1, s6 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v1, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v16, v16, v1, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v18, v18, v1, s5 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v2 -; MOVREL-NEXT: v_mov_b32_e32 v3, v19 -; MOVREL-NEXT: v_mov_b32_e32 v4, v20 -; MOVREL-NEXT: v_mov_b32_e32 v5, v21 -; MOVREL-NEXT: v_mov_b32_e32 v6, v22 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2 -; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 -; MOVREL-NEXT: v_mov_b32_e32 v7, v23 -; MOVREL-NEXT: v_mov_b32_e32 v8, v24 -; MOVREL-NEXT: v_mov_b32_e32 v9, v25 -; MOVREL-NEXT: v_mov_b32_e32 v10, v26 -; MOVREL-NEXT: v_mov_b32_e32 v11, 
v27 -; MOVREL-NEXT: v_mov_b32_e32 v12, v28 -; MOVREL-NEXT: v_mov_b32_e32 v13, v29 -; MOVREL-NEXT: v_mov_b32_e32 v14, v30 -; MOVREL-NEXT: v_mov_b32_e32 v15, v31 -; MOVREL-NEXT: v_mov_b32_e32 v16, v32 -; MOVREL-NEXT: v_mov_b32_e32 v17, v33 -; MOVREL-NEXT: v_mov_b32_e32 v18, v34 -; MOVREL-NEXT: v_movreld_b32_e32 v3, v0 -; MOVREL-NEXT: v_movreld_b32_e32 v4, v1 -; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo -; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo -; MOVREL-NEXT: s_cbranch_execnz BB17_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b32 exec_lo, s0 ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off @@ -1325,79 +1337,69 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double inreg %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b64 s[0:1], exec -; GPRIDX-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s4, v16 -; GPRIDX-NEXT: s_lshl_b32 s5, s4, 1 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 -; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v32, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v31, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v30, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v29, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v28, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v27, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v26, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v25, v8 -; GPRIDX-NEXT: v_mov_b32_e32 v24, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v23, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v22, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v21, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v20, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v19, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v18, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v17, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s2 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v18, s3 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB18_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[17:20], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[21:24], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[25:28], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[29:32], off +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 7, v16 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[14:15], 6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v16, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v17, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v17, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v17, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v17, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v17, s[14:15] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v17, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v16, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v16, s[6:7] +; GPRIDX-NEXT: 
v_cndmask_b32_e64 v9, v9, v16, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v16, s[14:15] +; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[12:13] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v16 -; MOVREL-NEXT: v_mov_b32_e32 v32, v15 -; MOVREL-NEXT: v_mov_b32_e32 v17, v0 -; MOVREL-NEXT: v_mov_b32_e32 v31, v14 -; MOVREL-NEXT: v_mov_b32_e32 v30, v13 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v16 -; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 -; MOVREL-NEXT: v_mov_b32_e32 v29, v12 -; MOVREL-NEXT: v_mov_b32_e32 v28, v11 -; MOVREL-NEXT: v_mov_b32_e32 v27, v10 -; MOVREL-NEXT: v_mov_b32_e32 v26, v9 -; MOVREL-NEXT: v_mov_b32_e32 v25, v8 -; MOVREL-NEXT: v_mov_b32_e32 v24, v7 -; MOVREL-NEXT: v_mov_b32_e32 v23, v6 -; MOVREL-NEXT: v_mov_b32_e32 v22, v5 -; MOVREL-NEXT: v_mov_b32_e32 v21, v4 -; MOVREL-NEXT: v_mov_b32_e32 v20, v3 -; MOVREL-NEXT: v_mov_b32_e32 v19, v2 -; MOVREL-NEXT: v_mov_b32_e32 v18, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v17, s2 -; MOVREL-NEXT: v_movreld_b32_e32 v18, s3 -; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo -; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo -; MOVREL-NEXT: s_cbranch_execnz BB18_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[17:20], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[21:24], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[25:28], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[29:32], off +; MOVREL-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v16 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v16 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v16 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v16 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, s3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v16 +; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, s2, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, s3, vcc_lo +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1453,79 +1455,69 @@ define 
amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b64 s[0:1], exec -; GPRIDX-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18 -; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 -; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v33, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v31, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v30, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v29, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v28, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v27, v8 -; GPRIDX-NEXT: v_mov_b32_e32 v26, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v25, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v24, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v23, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v22, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v21, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v20, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v20, v17 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB20_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v18 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, exec_lo -; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v18 -; MOVREL-NEXT: v_mov_b32_e32 v34, v15 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 +; MOVREL-NEXT: 
v_cmp_eq_u32_e64 s3, 4, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 -; MOVREL-NEXT: v_mov_b32_e32 v33, v14 -; MOVREL-NEXT: v_mov_b32_e32 v32, v13 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 -; MOVREL-NEXT: s_lshl_b32 m0, s1, 1 -; MOVREL-NEXT: v_mov_b32_e32 v31, v12 -; MOVREL-NEXT: v_mov_b32_e32 v30, v11 -; MOVREL-NEXT: v_mov_b32_e32 v29, v10 -; MOVREL-NEXT: v_mov_b32_e32 v28, v9 -; MOVREL-NEXT: v_mov_b32_e32 v27, v8 -; MOVREL-NEXT: v_mov_b32_e32 v26, v7 -; MOVREL-NEXT: v_mov_b32_e32 v25, v6 -; MOVREL-NEXT: v_mov_b32_e32 v24, v5 -; MOVREL-NEXT: v_mov_b32_e32 v23, v4 -; MOVREL-NEXT: v_mov_b32_e32 v22, v3 -; MOVREL-NEXT: v_mov_b32_e32 v21, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 ; MOVREL-NEXT: v_mov_b32_e32 v20, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 -; MOVREL-NEXT: v_movreld_b32_e32 v20, v17 -; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo -; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo -; MOVREL-NEXT: s_cbranch_execnz BB20_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v16, s6 +; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, v16, s5 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v17, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6 +; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; MOVREL-NEXT: s_endpgm entry: %insert = insertelement <8 x double> %vec, double %val, i32 %idx @@ -1543,22 +1535,23 @@ define amdgpu_ps <3 x i32> @dyn_insertelement_v3i32_s_s_s(<3 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v3i32_s_s_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 m0, s6 -; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s5 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 0 +; GPRIDX-NEXT: s_cselect_b32 s0, s5, s2 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 1 +; GPRIDX-NEXT: s_cselect_b32 s1, s5, s3 +; GPRIDX-NEXT: s_cmp_eq_u32 s6, 2 +; GPRIDX-NEXT: s_cselect_b32 s2, s5, s4 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v3i32_s_s_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s6 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 
s2, s4 -; MOVREL-NEXT: s_movreld_b32 s0, s5 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 0 ; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_cselect_b32 s0, s5, s2 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 1 +; MOVREL-NEXT: s_cselect_b32 s1, s5, s3 +; MOVREL-NEXT: s_cmp_eq_u32 s6, 2 +; MOVREL-NEXT: s_cselect_b32 s2, s5, s4 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <3 x i32> %vec, i32 %val, i32 %idx @@ -1568,16 +1561,23 @@ define amdgpu_ps <3 x float> @dyn_insertelement_v3i32_v_v_s(<3 x float> %vec, float %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v3i32_v_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v3 -; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v3i32_v_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: v_movreld_b32_e32 v0, v3 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <3 x float> %vec, float %val, i32 %idx @@ -1587,26 +1587,31 @@ define amdgpu_ps <5 x i32> @dyn_insertelement_v5i32_s_s_s(<5 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v5i32_s_s_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 m0, s8 -; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s7 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 0 +; GPRIDX-NEXT: s_cselect_b32 s0, s7, s2 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 +; GPRIDX-NEXT: s_cselect_b32 s1, s7, s3 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 2 +; GPRIDX-NEXT: s_cselect_b32 s2, s7, s4 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 3 +; GPRIDX-NEXT: s_cselect_b32 s3, s7, s5 +; GPRIDX-NEXT: s_cmp_eq_u32 s8, 4 +; GPRIDX-NEXT: s_cselect_b32 s4, s7, s6 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v5i32_s_s_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s8 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_movreld_b32 s0, s7 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 0 ; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_cselect_b32 s0, s7, s2 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 +; MOVREL-NEXT: s_cselect_b32 s1, s7, s3 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 2 +; MOVREL-NEXT: s_cselect_b32 s2, s7, s4 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 3 +; MOVREL-NEXT: s_cselect_b32 s3, s7, s5 +; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 +; MOVREL-NEXT: s_cselect_b32 s4, s7, s6 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <5 x i32> %vec, i32 %val, i32 %idx @@ -1616,16 +1621,31 @@ define amdgpu_ps <5 x float> @dyn_insertelement_v5i32_v_v_s(<5 x float> %vec, float %val, i32 inreg 
%idx) { ; GPRIDX-LABEL: dyn_insertelement_v5i32_v_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v0, v5 -; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 4 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v5i32_v_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: v_movreld_b32_e32 v0, v5 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3 +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4 +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <5 x float> %vec, float %val, i32 %idx @@ -1737,17 +1757,23 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 m0, s11 -; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s1, s10 +; GPRIDX-NEXT: s_add_i32 s11, s11, 1 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 0 +; GPRIDX-NEXT: s_cselect_b32 s0, s10, s2 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 1 +; GPRIDX-NEXT: s_cselect_b32 s1, s10, s3 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 2 +; GPRIDX-NEXT: s_cselect_b32 s2, s10, s4 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 3 +; GPRIDX-NEXT: s_cselect_b32 s3, s10, s5 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 4 +; GPRIDX-NEXT: s_cselect_b32 s4, s10, s6 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 5 +; GPRIDX-NEXT: s_cselect_b32 s5, s10, s7 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 6 +; GPRIDX-NEXT: s_cselect_b32 s6, s10, s8 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 7 +; GPRIDX-NEXT: s_cselect_b32 s7, s10, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -1760,25 +1786,32 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 m0, s11 -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_movreld_b32 s1, s10 +; MOVREL-NEXT: s_add_i32 s11, s11, 1 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_cmp_eq_u32 s11, 0 +; MOVREL-NEXT: s_cselect_b32 s0, s10, s2 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 1 ; MOVREL-NEXT: v_mov_b32_e32 v0, 
s0 +; MOVREL-NEXT: s_cselect_b32 s1, s10, s3 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: s_cselect_b32 s2, s10, s4 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: s_cselect_b32 s3, s10, s5 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 4 ; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: s_cselect_b32 s4, s10, s6 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 5 ; MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; MOVREL-NEXT: s_cselect_b32 s5, s10, s7 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 6 ; MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; MOVREL-NEXT: s_cselect_b32 s6, s10, s8 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 7 ; MOVREL-NEXT: v_mov_b32_e32 v6, s6 +; MOVREL-NEXT: s_cselect_b32 s7, s10, s9 ; MOVREL-NEXT: v_mov_b32_e32 v7, s7 -; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 1 @@ -1789,17 +1822,23 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_7: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 m0, s11 -; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s7, s10 +; GPRIDX-NEXT: s_add_i32 s11, s11, 7 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 0 +; GPRIDX-NEXT: s_cselect_b32 s0, s10, s2 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 1 +; GPRIDX-NEXT: s_cselect_b32 s1, s10, s3 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 2 +; GPRIDX-NEXT: s_cselect_b32 s2, s10, s4 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 3 +; GPRIDX-NEXT: s_cselect_b32 s3, s10, s5 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 4 +; GPRIDX-NEXT: s_cselect_b32 s4, s10, s6 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 5 +; GPRIDX-NEXT: s_cselect_b32 s5, s10, s7 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 6 +; GPRIDX-NEXT: s_cselect_b32 s6, s10, s8 +; GPRIDX-NEXT: s_cmp_eq_u32 s11, 7 +; GPRIDX-NEXT: s_cselect_b32 s7, s10, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 @@ -1812,25 +1851,32 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_7: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 m0, s11 -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_movreld_b32 s7, s10 +; MOVREL-NEXT: s_add_i32 s11, s11, 7 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_cmp_eq_u32 s11, 0 +; MOVREL-NEXT: s_cselect_b32 s0, s10, s2 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 1 ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; MOVREL-NEXT: s_cselect_b32 s1, s10, s3 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 2 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: s_cselect_b32 s2, s10, s4 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 3 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: s_cselect_b32 s3, s10, s5 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 4 ; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: s_cselect_b32 s4, s10, s6 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 5 ; MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; MOVREL-NEXT: s_cselect_b32 s5, s10, s7 +; MOVREL-NEXT: s_cmp_eq_u32 s11, 6 ; MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; MOVREL-NEXT: s_cselect_b32 s6, s10, s8 +; MOVREL-NEXT: 
s_cmp_eq_u32 s11, 7 ; MOVREL-NEXT: v_mov_b32_e32 v6, s6 +; MOVREL-NEXT: s_cselect_b32 s7, s10, s9 ; MOVREL-NEXT: v_mov_b32_e32 v7, s7 -; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 7 @@ -1841,66 +1887,45 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b64 s[0:1], exec -; GPRIDX-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v8 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB29_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 -; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 +; GPRIDX-NEXT: v_add_u32_e32 v9, 1, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: v_add_nc_u32_e32 v9, 1, v9 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 -; MOVREL-NEXT: v_mov_b32_e32 v17, v7 -; MOVREL-NEXT: v_mov_b32_e32 v11, v1 -; MOVREL-NEXT: v_mov_b32_e32 v16, v6 -; MOVREL-NEXT: v_mov_b32_e32 v15, v5 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v14, v4 -; MOVREL-NEXT: v_mov_b32_e32 v13, v3 -; MOVREL-NEXT: v_mov_b32_e32 v12, v2 -; MOVREL-NEXT: v_mov_b32_e32 v10, v0 -; MOVREL-NEXT: v_movreld_b32_e32 v11, v8 -; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo -; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo -; MOVREL-NEXT: s_cbranch_execnz BB29_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, v10 -; MOVREL-NEXT: v_mov_b32_e32 v1, v11 -; MOVREL-NEXT: v_mov_b32_e32 v2, v12 -; MOVREL-NEXT: v_mov_b32_e32 v3, v13 -; MOVREL-NEXT: v_mov_b32_e32 v4, v14 -; MOVREL-NEXT: 
v_mov_b32_e32 v5, v15 -; MOVREL-NEXT: v_mov_b32_e32 v6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v7, v17 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 1 @@ -1911,66 +1936,45 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b64 s[0:1], exec -; GPRIDX-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 -; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v17, v8 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB30_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 -; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 +; GPRIDX-NEXT: v_add_u32_e32 v9, 7, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v9 +; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: v_add_nc_u32_e32 v9, 7, v9 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 -; MOVREL-NEXT: v_mov_b32_e32 v17, v7 -; MOVREL-NEXT: v_mov_b32_e32 v16, v6 -; MOVREL-NEXT: v_mov_b32_e32 v15, 
v5 -; MOVREL-NEXT: v_mov_b32_e32 v14, v4 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 -; MOVREL-NEXT: s_mov_b32 m0, s1 -; MOVREL-NEXT: v_mov_b32_e32 v13, v3 -; MOVREL-NEXT: v_mov_b32_e32 v12, v2 -; MOVREL-NEXT: v_mov_b32_e32 v11, v1 -; MOVREL-NEXT: v_mov_b32_e32 v10, v0 -; MOVREL-NEXT: v_movreld_b32_e32 v17, v8 -; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo -; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo -; MOVREL-NEXT: s_cbranch_execnz BB30_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: v_mov_b32_e32 v0, v10 -; MOVREL-NEXT: v_mov_b32_e32 v1, v11 -; MOVREL-NEXT: v_mov_b32_e32 v2, v12 -; MOVREL-NEXT: v_mov_b32_e32 v3, v13 -; MOVREL-NEXT: v_mov_b32_e32 v4, v14 -; MOVREL-NEXT: v_mov_b32_e32 v5, v15 -; MOVREL-NEXT: v_mov_b32_e32 v6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v7, v17 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v9 +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc_lo ; MOVREL-NEXT: ; return to shader part epilog entry: %idx.add = add i32 %idx, 7 @@ -2084,81 +2088,71 @@ define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b64 s[0:1], exec -; GPRIDX-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 -; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18 -; GPRIDX-NEXT: s_add_i32 s3, s2, 1 -; GPRIDX-NEXT: s_lshl_b32 s3, s3, 1 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 -; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 -; GPRIDX-NEXT: v_mov_b32_e32 v33, v14 -; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 -; GPRIDX-NEXT: v_mov_b32_e32 v31, v12 -; GPRIDX-NEXT: v_mov_b32_e32 v30, v11 -; GPRIDX-NEXT: v_mov_b32_e32 v29, v10 -; GPRIDX-NEXT: v_mov_b32_e32 v28, v9 -; GPRIDX-NEXT: v_mov_b32_e32 v27, v8 -; GPRIDX-NEXT: v_mov_b32_e32 v26, v7 -; GPRIDX-NEXT: v_mov_b32_e32 v25, v6 -; GPRIDX-NEXT: v_mov_b32_e32 v24, v5 -; GPRIDX-NEXT: v_mov_b32_e32 v23, v4 -; GPRIDX-NEXT: v_mov_b32_e32 v22, v3 -; GPRIDX-NEXT: v_mov_b32_e32 v21, v2 -; GPRIDX-NEXT: v_mov_b32_e32 v20, v1 -; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 -; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) -; GPRIDX-NEXT: v_mov_b32_e32 v20, v17 -; GPRIDX-NEXT: s_set_gpr_idx_off -; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc -; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc -; GPRIDX-NEXT: s_cbranch_execnz BB32_1 -; GPRIDX-NEXT: ; %bb.2: -; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; GPRIDX-NEXT: v_add_u32_e32 v18, 1, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; 
GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v18 +; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v16, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc +; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v17, s[0:1] +; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v16, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v16, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v16, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v16, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v16, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v14, v14, v16, s[10:11] +; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v17, s[2:3] +; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v17, s[4:5] +; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v17, s[6:7] +; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v17, s[8:9] +; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v17, s[12:13] +; GPRIDX-NEXT: v_cndmask_b32_e64 v15, v15, v17, s[10:11] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; GPRIDX-NEXT: s_endpgm ; ; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, exec_lo -; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 -; MOVREL-NEXT: v_readfirstlane_b32 s1, v18 -; MOVREL-NEXT: v_mov_b32_e32 v34, v15 +; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 -; MOVREL-NEXT: v_mov_b32_e32 v33, v14 -; MOVREL-NEXT: v_mov_b32_e32 v32, v13 -; MOVREL-NEXT: s_add_i32 s2, s1, 1 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 -; MOVREL-NEXT: s_lshl_b32 m0, s2, 1 -; MOVREL-NEXT: v_mov_b32_e32 v31, v12 -; MOVREL-NEXT: v_mov_b32_e32 v30, v11 -; MOVREL-NEXT: v_mov_b32_e32 v29, v10 -; MOVREL-NEXT: v_mov_b32_e32 v28, v9 -; MOVREL-NEXT: v_mov_b32_e32 v27, v8 -; MOVREL-NEXT: v_mov_b32_e32 v26, v7 -; MOVREL-NEXT: v_mov_b32_e32 v25, v6 -; MOVREL-NEXT: v_mov_b32_e32 v24, v5 -; MOVREL-NEXT: v_mov_b32_e32 v23, v4 -; MOVREL-NEXT: v_mov_b32_e32 v22, v3 -; MOVREL-NEXT: v_mov_b32_e32 v21, v2 ; MOVREL-NEXT: v_mov_b32_e32 v20, v1 -; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 -; MOVREL-NEXT: v_movreld_b32_e32 v20, v17 -; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo -; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo -; MOVREL-NEXT: s_cbranch_execnz BB32_1 -; MOVREL-NEXT: ; %bb.2: -; MOVREL-NEXT: s_mov_b32 exec_lo, s0 -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off -; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 6, v18 +; MOVREL-NEXT: v_cndmask_b32_e32 v0, v19, v16, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 +; 
MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 +; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v16, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v16, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v16, s6 +; MOVREL-NEXT: v_cndmask_b32_e64 v14, v14, v16, s5 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v17, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v17, s2 +; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v17, s6 +; MOVREL-NEXT: v_cndmask_b32_e64 v15, v15, v17, s5 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off ; MOVREL-NEXT: s_endpgm entry: %idx.add = add i32 %idx, 1 @@ -3401,30 +3395,39 @@ define amdgpu_ps <7 x i32> @dyn_insertelement_v7i32_s_s_s(<7 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v7i32_s_s_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 m0, s10 -; GPRIDX-NEXT: s_nop 0 -; GPRIDX-NEXT: s_movreld_b32 s0, s9 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 0 +; GPRIDX-NEXT: s_cselect_b32 s0, s9, s2 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 1 +; GPRIDX-NEXT: s_cselect_b32 s1, s9, s3 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 2 +; GPRIDX-NEXT: s_cselect_b32 s2, s9, s4 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 3 +; GPRIDX-NEXT: s_cselect_b32 s3, s9, s5 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 4 +; GPRIDX-NEXT: s_cselect_b32 s4, s9, s6 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 5 +; GPRIDX-NEXT: s_cselect_b32 s5, s9, s7 +; GPRIDX-NEXT: s_cmp_eq_u32 s10, 6 +; GPRIDX-NEXT: s_cselect_b32 s6, s9, s8 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v7i32_s_s_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 m0, s10 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_movreld_b32 s0, s9 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 0 ; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_cselect_b32 s0, s9, s2 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 1 +; MOVREL-NEXT: s_cselect_b32 s1, s9, s3 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 2 +; MOVREL-NEXT: s_cselect_b32 s2, s9, s4 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 3 +; MOVREL-NEXT: s_cselect_b32 s3, s9, s5 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 4 +; MOVREL-NEXT: s_cselect_b32 s4, s9, s6 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 5 +; MOVREL-NEXT: s_cselect_b32 s5, s9, s7 +; MOVREL-NEXT: s_cmp_eq_u32 s10, 6 +; MOVREL-NEXT: s_cselect_b32 s6, s9, s8 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <7 x i32> %vec, i32 %val, i32 %idx @@ -3434,30 +3437,39 @@ define amdgpu_ps <7 x i8 addrspace(3)*> @dyn_insertelement_v7p3i8_s_s_s(<7 x i8 addrspace(3)*> inreg %vec, i8 addrspace(3)* inreg %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v7p3i8_s_s_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; 
GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 m0, s10
-; GPRIDX-NEXT: s_nop 0
-; GPRIDX-NEXT: s_movreld_b32 s0, s9
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 0
+; GPRIDX-NEXT: s_cselect_b32 s0, s9, s2
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 1
+; GPRIDX-NEXT: s_cselect_b32 s1, s9, s3
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 2
+; GPRIDX-NEXT: s_cselect_b32 s2, s9, s4
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 3
+; GPRIDX-NEXT: s_cselect_b32 s3, s9, s5
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 4
+; GPRIDX-NEXT: s_cselect_b32 s4, s9, s6
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 5
+; GPRIDX-NEXT: s_cselect_b32 s5, s9, s7
+; GPRIDX-NEXT: s_cmp_eq_u32 s10, 6
+; GPRIDX-NEXT: s_cselect_b32 s6, s9, s8
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7p3i8_s_s_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 m0, s10
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_movreld_b32 s0, s9
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: s_cselect_b32 s0, s9, s2
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 1
+; MOVREL-NEXT: s_cselect_b32 s1, s9, s3
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 2
+; MOVREL-NEXT: s_cselect_b32 s2, s9, s4
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 3
+; MOVREL-NEXT: s_cselect_b32 s3, s9, s5
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 4
+; MOVREL-NEXT: s_cselect_b32 s4, s9, s6
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 5
+; MOVREL-NEXT: s_cselect_b32 s5, s9, s7
+; MOVREL-NEXT: s_cmp_eq_u32 s10, 6
+; MOVREL-NEXT: s_cselect_b32 s6, s9, s8
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 %idx
@@ -3474,18 +3486,29 @@
 ; GPRIDX-NEXT: s_mov_b32 s4, s6
 ; GPRIDX-NEXT: s_mov_b32 s5, s7
 ; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
-; GPRIDX-NEXT: s_set_gpr_idx_on s9, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s3
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s4
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s5
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s6
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s:
@@ -3497,18 +3520,30 @@
 ; MOVREL-NEXT: s_mov_b32 s4, s6
 ; MOVREL-NEXT: s_mov_b32 s5, s7
 ; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v8, v0
-; MOVREL-NEXT: v_mov_b32_e32 v0, s0
-; MOVREL-NEXT: s_mov_b32 m0, s9
-; MOVREL-NEXT: v_mov_b32_e32 v1, s1
-; MOVREL-NEXT: v_mov_b32_e32 v2, s2
-; MOVREL-NEXT: v_mov_b32_e32 v3, s3
-; MOVREL-NEXT: v_mov_b32_e32 v4, s4
-; MOVREL-NEXT: v_mov_b32_e32 v5, s5
-; MOVREL-NEXT: v_mov_b32_e32 v6, s6
-; MOVREL-NEXT: v_mov_b32_e32 v7, s7
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
+; MOVREL-NEXT: v_mov_b32_e32 v16, s7
+; MOVREL-NEXT: v_mov_b32_e32 v9, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0
+; MOVREL-NEXT: v_mov_b32_e32 v10, s1
+; MOVREL-NEXT: v_mov_b32_e32 v11, s2
+; MOVREL-NEXT: v_mov_b32_e32 v12, s3
+; MOVREL-NEXT: v_mov_b32_e32 v13, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1
+; MOVREL-NEXT: v_mov_b32_e32 v14, s5
+; MOVREL-NEXT: v_mov_b32_e32 v15, s6
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v10, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 4
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 5
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 6
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v0, v7
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x float> %vec, float %val, i32 %idx
@@ -3525,36 +3560,30 @@
 ; GPRIDX-NEXT: s_mov_b32 s4, s6
 ; GPRIDX-NEXT: s_mov_b32 s5, s7
 ; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v17, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v16, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v15, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v14, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v13, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v12, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v11, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v10, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB46_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v16
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v17
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s1
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s2
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s3
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s4
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s5
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc
 ; GPRIDX-NEXT: v_mov_b32_e32 v0, v8
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB46_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v1, v7
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v:
@@ -3566,41 +3595,31 @@
 ; MOVREL-NEXT: s_mov_b32 s4, s6
 ; MOVREL-NEXT: s_mov_b32 s5, s7
 ; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: v_mov_b32_e32 v17, s7
-; MOVREL-NEXT: v_mov_b32_e32 v16, s6
-; MOVREL-NEXT: v_mov_b32_e32 v15, s5
-; MOVREL-NEXT: v_mov_b32_e32 v14, s4
-; MOVREL-NEXT: v_mov_b32_e32 v13, s3
-; MOVREL-NEXT: v_mov_b32_e32 v12, s2
-; MOVREL-NEXT: v_mov_b32_e32 v11, s1
-; MOVREL-NEXT: v_mov_b32_e32 v10, s0
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v16, s7
+; MOVREL-NEXT: v_mov_b32_e32 v9, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; MOVREL-NEXT: v_mov_b32_e32 v10, s1
+; MOVREL-NEXT: v_mov_b32_e32 v11, s2
+; MOVREL-NEXT: v_mov_b32_e32 v12, s3
+; MOVREL-NEXT: v_mov_b32_e32 v13, s4
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v9, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; MOVREL-NEXT: v_mov_b32_e32 v14, s5
+; MOVREL-NEXT: v_mov_b32_e32 v15, s6
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB46_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT: v_mov_b32_e32 v2, v10
-; MOVREL-NEXT: v_mov_b32_e32 v3, v11
-; MOVREL-NEXT: v_mov_b32_e32 v4, v12
-; MOVREL-NEXT: v_mov_b32_e32 v5, v13
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
-; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v6, v14
-; MOVREL-NEXT: v_mov_b32_e32 v7, v15
-; MOVREL-NEXT: v_mov_b32_e32 v8, v16
-; MOVREL-NEXT: v_mov_b32_e32 v9, v17
-; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB46_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, v2
-; MOVREL-NEXT: v_mov_b32_e32 v1, v3
-; MOVREL-NEXT: v_mov_b32_e32 v2, v4
-; MOVREL-NEXT: v_mov_b32_e32 v3, v5
-; MOVREL-NEXT: v_mov_b32_e32 v4, v6
-; MOVREL-NEXT: v_mov_b32_e32 v5, v7
-; MOVREL-NEXT: v_mov_b32_e32 v6, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v11, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v13, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v14, v0, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v1
+; MOVREL-NEXT: v_mov_b32_e32 v1, v7
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v15, v0, vcc_lo
+; MOVREL-NEXT: v_mov_b32_e32 v0, v8
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x float> %vec, float %val, i32 %idx
@@ -3610,16 +3629,39 @@
 define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_v_v_s(<7 x float> %vec, float %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v7f32_v_v_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v7
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7f32_v_v_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 m0, s2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v7
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 5
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 6
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x float> %vec, float %val, i32 %idx
@@ -3629,64 +3671,39 @@
 define amdgpu_ps <7 x float> @dyn_insertelement_v7f32_v_v_v(<7 x float> %vec, float %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v7f32_v_v_v:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB48_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v8
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v8
-; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v7
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v6
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v5
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v4
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v3
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v2
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v7
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB48_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v9
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v15
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v8
+; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7f32_v_v_v:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB48_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v8
-; MOVREL-NEXT: v_mov_b32_e32 v16, v7
-; MOVREL-NEXT: v_mov_b32_e32 v9, v0
-; MOVREL-NEXT: v_mov_b32_e32 v15, v6
-; MOVREL-NEXT: v_mov_b32_e32 v14, v5
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8
-; MOVREL-NEXT: s_mov_b32 m0, s1
-; MOVREL-NEXT: v_mov_b32_e32 v13, v4
-; MOVREL-NEXT: v_mov_b32_e32 v12, v3
-; MOVREL-NEXT: v_mov_b32_e32 v11, v2
-; MOVREL-NEXT: v_mov_b32_e32 v10, v1
-; MOVREL-NEXT: v_movreld_b32_e32 v9, v7
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB48_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_mov_b32_e32 v0, v9
-; MOVREL-NEXT: v_mov_b32_e32 v1, v10
-; MOVREL-NEXT: v_mov_b32_e32 v2, v11
-; MOVREL-NEXT: v_mov_b32_e32 v3, v12
-; MOVREL-NEXT: v_mov_b32_e32 v4, v13
-; MOVREL-NEXT: v_mov_b32_e32 v5, v14
-; MOVREL-NEXT: v_mov_b32_e32 v6, v15
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v8
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x float> %vec, float %val, i32 %idx
@@ -3867,68 +3884,57 @@
 ; GPRIDX-NEXT: s_mov_b32 s11, s13
 ; GPRIDX-NEXT: s_mov_b32 s12, s14
 ; GPRIDX-NEXT: s_mov_b32 s13, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v34, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v33, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v32, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v31, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v30, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v29, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v28, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v27, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v26, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v25, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v24, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v23, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v22, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v21, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v20, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v19, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB51_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
-; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v22
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v23
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v24
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v25
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v26
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v27
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v28
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v29
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v30
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v31
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v32
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v33
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v34
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v1
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB51_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v2
+; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v11, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v13, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v15, v0, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v14, v1, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v1, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v16, v1, s[8:9]
 ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3
 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
 ; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6
-; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5
 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9
+; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7
 ; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v11
+; GPRIDX-NEXT: v_readfirstlane_b32 s8, v9
 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v12
-; GPRIDX-NEXT: v_readfirstlane_b32 s10, v13
-; GPRIDX-NEXT: v_readfirstlane_b32 s11, v14
-; GPRIDX-NEXT: v_readfirstlane_b32 s12, v15
-; GPRIDX-NEXT: v_readfirstlane_b32 s13, v16
+; GPRIDX-NEXT: v_readfirstlane_b32 s10, v11
+; GPRIDX-NEXT: v_readfirstlane_b32 s11, v13
+; GPRIDX-NEXT: v_readfirstlane_b32 s12, v0
+; GPRIDX-NEXT: v_readfirstlane_b32 s13, v1
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7f64_s_v_v:
@@ -3947,65 +3953,58 @@
 ; MOVREL-NEXT: s_mov_b32 s11, s13
 ; MOVREL-NEXT: s_mov_b32 s12, s14
 ; MOVREL-NEXT: s_mov_b32 s13, s15
-; MOVREL-NEXT: v_mov_b32_e32 v34, s15
-; MOVREL-NEXT: v_mov_b32_e32 v33, s14
-; MOVREL-NEXT: v_mov_b32_e32 v32, s13
-; MOVREL-NEXT: v_mov_b32_e32 v31, s12
-; MOVREL-NEXT: v_mov_b32_e32 v30, s11
-; MOVREL-NEXT: v_mov_b32_e32 v29, s10
-; MOVREL-NEXT: v_mov_b32_e32 v28, s9
-; MOVREL-NEXT: v_mov_b32_e32 v27, s8
-; MOVREL-NEXT: v_mov_b32_e32 v26, s7
-; MOVREL-NEXT: v_mov_b32_e32 v25, s6
-; MOVREL-NEXT: v_mov_b32_e32 v24, s5
-; MOVREL-NEXT: v_mov_b32_e32 v23, s4
-; MOVREL-NEXT: v_mov_b32_e32 v22, s3
-; MOVREL-NEXT: v_mov_b32_e32 v21, s2
-; MOVREL-NEXT: v_mov_b32_e32 v20, s1
-; MOVREL-NEXT: v_mov_b32_e32 v19, s0
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v18, s15
+; MOVREL-NEXT: v_mov_b32_e32 v17, s14
+; MOVREL-NEXT: v_mov_b32_e32 v16, s13
+; MOVREL-NEXT: v_mov_b32_e32 v15, s12
+; MOVREL-NEXT: v_mov_b32_e32 v14, s11
+; MOVREL-NEXT: v_mov_b32_e32 v13, s10
+; MOVREL-NEXT: v_mov_b32_e32 v12, s9
+; MOVREL-NEXT: v_mov_b32_e32 v11, s8
+; MOVREL-NEXT: v_mov_b32_e32 v10, s7
+; MOVREL-NEXT: v_mov_b32_e32 v9, s6
+; MOVREL-NEXT: v_mov_b32_e32 v8, s5
+; MOVREL-NEXT: v_mov_b32_e32 v7, s4
+; MOVREL-NEXT: v_mov_b32_e32 v6, s3
+; MOVREL-NEXT: v_mov_b32_e32 v5, s2
+; MOVREL-NEXT: v_mov_b32_e32 v4, s1
+; MOVREL-NEXT: v_mov_b32_e32 v3, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 6, v2
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB51_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v2
-; MOVREL-NEXT: v_mov_b32_e32 v3, v19
-; MOVREL-NEXT: v_mov_b32_e32 v4, v20
-; MOVREL-NEXT: v_mov_b32_e32 v5, v21
-; MOVREL-NEXT: v_mov_b32_e32 v6, v22
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2
-; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
-; MOVREL-NEXT: v_mov_b32_e32 v7, v23
-; MOVREL-NEXT: v_mov_b32_e32 v8, v24
-; MOVREL-NEXT: v_mov_b32_e32 v9, v25
-; MOVREL-NEXT: v_mov_b32_e32 v10, v26
-; MOVREL-NEXT: v_mov_b32_e32 v11, v27
-; MOVREL-NEXT: v_mov_b32_e32 v12, v28
-; MOVREL-NEXT: v_mov_b32_e32 v13, v29
-; MOVREL-NEXT: v_mov_b32_e32 v14, v30
-; MOVREL-NEXT: v_mov_b32_e32 v15, v31
-; MOVREL-NEXT: v_mov_b32_e32 v16, v32
-; MOVREL-NEXT: v_mov_b32_e32 v17, v33
-; MOVREL-NEXT: v_mov_b32_e32 v18, v34
-; MOVREL-NEXT: v_movreld_b32_e32 v3, v0
-; MOVREL-NEXT: v_movreld_b32_e32 v4, v1
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB51_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_readfirstlane_b32 s0, v3
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v4
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 5, v2
 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v5
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo
 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v6
+; MOVREL-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1
+; MOVREL-NEXT: v_readfirstlane_b32 s0, v3
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v4
 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v7
 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v8
 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v9
 ; MOVREL-NEXT: v_readfirstlane_b32 s7, v10
 ; MOVREL-NEXT: v_readfirstlane_b32 s8, v11
-; MOVREL-NEXT: v_readfirstlane_b32 s9, v12
-; MOVREL-NEXT: v_readfirstlane_b32 s10, v13
-; MOVREL-NEXT: v_readfirstlane_b32 s11, v14
-; MOVREL-NEXT: v_readfirstlane_b32 s12, v15
-; MOVREL-NEXT: v_readfirstlane_b32 s13, v16
+; MOVREL-NEXT: v_readfirstlane_b32 s9, v2
+; MOVREL-NEXT: v_readfirstlane_b32 s10, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s11, v13
+; MOVREL-NEXT: v_readfirstlane_b32 s12, v0
+; MOVREL-NEXT: v_readfirstlane_b32 s13, v1
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x double> %vec, double %val, i32 %idx
@@ -4067,99 +4066,83 @@
 define amdgpu_ps <7 x double> @dyn_insertelement_v7f64_v_v_v(<7 x double> %vec, double %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v7f64_v_v_v:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB53_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v16
-; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v16
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v32, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v31, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v30, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v29, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v28, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v27, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v26, v9
-; GPRIDX-NEXT: v_mov_b32_e32 v25, v8
-; GPRIDX-NEXT: v_mov_b32_e32 v24, v7
-; GPRIDX-NEXT: v_mov_b32_e32 v23, v6
-; GPRIDX-NEXT: v_mov_b32_e32 v22, v5
-; GPRIDX-NEXT: v_mov_b32_e32 v21, v4
-; GPRIDX-NEXT: v_mov_b32_e32 v20, v3
-; GPRIDX-NEXT: v_mov_b32_e32 v19, v2
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v14
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v15
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB53_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
-; GPRIDX-NEXT: v_readfirstlane_b32 s0, v17
-; GPRIDX-NEXT: v_readfirstlane_b32 s1, v18
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v19
-; GPRIDX-NEXT: v_readfirstlane_b32 s3, v20
-; GPRIDX-NEXT: v_readfirstlane_b32 s4, v21
-; GPRIDX-NEXT: v_readfirstlane_b32 s5, v22
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v23
-; GPRIDX-NEXT: v_readfirstlane_b32 s7, v24
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v25
-; GPRIDX-NEXT: v_readfirstlane_b32 s9, v26
-; GPRIDX-NEXT: v_readfirstlane_b32 s10, v27
-; GPRIDX-NEXT: v_readfirstlane_b32 s11, v28
-; GPRIDX-NEXT: v_readfirstlane_b32 s12, v29
-; GPRIDX-NEXT: v_readfirstlane_b32 s13, v30
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 5, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 6, v16
+; GPRIDX-NEXT: v_cndmask_b32_e64 v12, v12, v14, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[10:11]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v10, v10, v14, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v11, v11, v15, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v15, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v15, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc
+; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0
+; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT: v_readfirstlane_b32 s3, v3
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v4
+; GPRIDX-NEXT: v_readfirstlane_b32 s5, v5
+; GPRIDX-NEXT: v_readfirstlane_b32 s6, v6
+; GPRIDX-NEXT: v_readfirstlane_b32 s7, v7
+; GPRIDX-NEXT: v_readfirstlane_b32 s8, v8
+; GPRIDX-NEXT: v_readfirstlane_b32 s9, v9
+; GPRIDX-NEXT: v_readfirstlane_b32 s10, v10
+; GPRIDX-NEXT: v_readfirstlane_b32 s11, v11
+; GPRIDX-NEXT: v_readfirstlane_b32 s12, v12
+; GPRIDX-NEXT: v_readfirstlane_b32 s13, v13
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v7f64_v_v_v:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v16
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v16
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16
+; MOVREL-NEXT: v_mov_b32_e32 v17, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16
+; MOVREL-NEXT: v_mov_b32_e32 v18, v3
+; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v14, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5
+; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2
+; MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v14, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3
+; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v15, s4
+; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v14, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v13, v13, v15, s5
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v15, s1
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v15, vcc_lo
+; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
+; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
+; MOVREL-NEXT: v_readfirstlane_b32 s4, v4
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT: v_readfirstlane_b32 s5, v5
+; MOVREL-NEXT: v_readfirstlane_b32 s6, v6
+; MOVREL-NEXT: v_readfirstlane_b32 s7, v7
+; MOVREL-NEXT: v_readfirstlane_b32 s8, v8
+; MOVREL-NEXT: v_readfirstlane_b32 s9, v9
+; MOVREL-NEXT: v_readfirstlane_b32 s10, v10
+; MOVREL-NEXT: v_readfirstlane_b32 s11, v11
+; MOVREL-NEXT: v_readfirstlane_b32 s12, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s13, v13
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB53_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v16
-; MOVREL-NEXT: v_mov_b32_e32 v32, v15
-; MOVREL-NEXT: v_mov_b32_e32 v17, v0
-; MOVREL-NEXT: v_mov_b32_e32 v31, v14
-; MOVREL-NEXT: v_mov_b32_e32 v30, v13
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v16
-; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
-; MOVREL-NEXT: v_mov_b32_e32 v29, v12
-; MOVREL-NEXT: v_mov_b32_e32 v28, v11
-; MOVREL-NEXT: v_mov_b32_e32 v27, v10
-; MOVREL-NEXT: v_mov_b32_e32 v26, v9
-; MOVREL-NEXT: v_mov_b32_e32 v25, v8
-; MOVREL-NEXT: v_mov_b32_e32 v24, v7
-; MOVREL-NEXT: v_mov_b32_e32 v23, v6
-; MOVREL-NEXT: v_mov_b32_e32 v22, v5
-; MOVREL-NEXT: v_mov_b32_e32 v21, v4
-; MOVREL-NEXT: v_mov_b32_e32 v20, v3
-; MOVREL-NEXT: v_mov_b32_e32 v19, v2
-; MOVREL-NEXT: v_mov_b32_e32 v18, v1
-; MOVREL-NEXT: v_movreld_b32_e32 v17, v14
-; MOVREL-NEXT: v_movreld_b32_e32 v18, v15
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB53_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_readfirstlane_b32 s0, v17
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v18
-; MOVREL-NEXT: v_readfirstlane_b32 s2, v19
-; MOVREL-NEXT: v_readfirstlane_b32 s3, v20
-; MOVREL-NEXT: v_readfirstlane_b32 s4, v21
-; MOVREL-NEXT: v_readfirstlane_b32 s5, v22
-; MOVREL-NEXT: v_readfirstlane_b32 s6, v23
-; MOVREL-NEXT: v_readfirstlane_b32 s7, v24
-; MOVREL-NEXT: v_readfirstlane_b32 s8, v25
-; MOVREL-NEXT: v_readfirstlane_b32 s9, v26
-; MOVREL-NEXT: v_readfirstlane_b32 s10, v27
-; MOVREL-NEXT: v_readfirstlane_b32 s11, v28
-; MOVREL-NEXT: v_readfirstlane_b32 s12, v29
-; MOVREL-NEXT: v_readfirstlane_b32 s13, v30
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <7 x double> %vec, double %val, i32 %idx
@@ -4169,36 +4152,31 @@
 define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_s_s_s(<5 x double> inreg %vec, double inreg %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v5f64_s_s_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b32 s0, s2
-; GPRIDX-NEXT: s_mov_b32 s1, s3
-; GPRIDX-NEXT: s_mov_b32 s2, s4
-; GPRIDX-NEXT: s_mov_b32 s3, s5
-; GPRIDX-NEXT: s_mov_b32 s4, s6
-; GPRIDX-NEXT: s_mov_b32 s5, s7
-; GPRIDX-NEXT: s_mov_b32 s6, s8
-; GPRIDX-NEXT: s_mov_b32 s7, s9
-; GPRIDX-NEXT: s_mov_b32 s8, s10
-; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: s_mov_b32 m0, s14
-; GPRIDX-NEXT: s_nop 0
-; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[12:13]
+; GPRIDX-NEXT: s_cmp_eq_u32 s14, 0
+; GPRIDX-NEXT: s_cselect_b64 s[0:1], s[12:13], s[2:3]
+; GPRIDX-NEXT: s_cmp_eq_u32 s14, 1
+; GPRIDX-NEXT: s_cselect_b64 s[2:3], s[12:13], s[4:5]
+; GPRIDX-NEXT: s_cmp_eq_u32 s14, 2
+; GPRIDX-NEXT: s_cselect_b64 s[4:5], s[12:13], s[6:7]
+; GPRIDX-NEXT: s_cmp_eq_u32 s14, 3
+; GPRIDX-NEXT: s_cselect_b64 s[6:7], s[12:13], s[8:9]
+; GPRIDX-NEXT: s_cmp_eq_u32 s14, 4
+; GPRIDX-NEXT: s_cselect_b64 s[8:9], s[12:13], s[10:11]
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v5f64_s_s_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, s2
-; MOVREL-NEXT: s_mov_b32 s1, s3
-; MOVREL-NEXT: s_mov_b32 m0, s14
-; MOVREL-NEXT: s_mov_b32 s2, s4
-; MOVREL-NEXT: s_mov_b32 s3, s5
-; MOVREL-NEXT: s_mov_b32 s4, s6
-; MOVREL-NEXT: s_mov_b32 s5, s7
-; MOVREL-NEXT: s_mov_b32 s6, s8
-; MOVREL-NEXT: s_mov_b32 s7, s9
-; MOVREL-NEXT: s_mov_b32 s8, s10
-; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: s_movreld_b64 s[0:1], s[12:13]
+; MOVREL-NEXT: s_cmp_eq_u32 s14, 0
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: s_cselect_b64 s[0:1], s[12:13], s[2:3]
+; MOVREL-NEXT: s_cmp_eq_u32 s14, 1
+; MOVREL-NEXT: s_cselect_b64 s[2:3], s[12:13], s[4:5]
+; MOVREL-NEXT: s_cmp_eq_u32 s14, 2
+; MOVREL-NEXT: s_cselect_b64 s[4:5], s[12:13], s[6:7]
+; MOVREL-NEXT: s_cmp_eq_u32 s14, 3
+; MOVREL-NEXT: s_cselect_b64 s[6:7], s[12:13], s[8:9]
+; MOVREL-NEXT: s_cmp_eq_u32 s14, 4
+; MOVREL-NEXT: s_cselect_b64 s[8:9], s[12:13], s[10:11]
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
@@ -4234,11 +4212,21 @@
 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s2
 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s1
 ; GPRIDX-NEXT: v_mov_b32_e32 v2, s0
-; GPRIDX-NEXT: s_lshl_b32 s0, s12, 1
-; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v2, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v1
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s12, 0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4
+; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v0, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v1, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5]
 ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v2
 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v3
 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v4
@@ -4247,8 +4235,8 @@
 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v7
 ; GPRIDX-NEXT: v_readfirstlane_b32 s6, v8
 ; GPRIDX-NEXT: v_readfirstlane_b32 s7, v9
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v10
-; GPRIDX-NEXT: v_readfirstlane_b32 s9, v11
+; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0
+; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v5f64_s_v_s:
@@ -4263,36 +4251,48 @@
 ; MOVREL-NEXT: s_mov_b32 s7, s9
 ; MOVREL-NEXT: s_mov_b32 s8, s10
 ; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: v_mov_b32_e32 v17, s15
-; MOVREL-NEXT: v_mov_b32_e32 v2, s0
-; MOVREL-NEXT: s_lshl_b32 m0, s12, 1
-; MOVREL-NEXT: v_mov_b32_e32 v16, s14
-; MOVREL-NEXT: v_mov_b32_e32 v15, s13
-; MOVREL-NEXT: v_mov_b32_e32 v14, s12
-; MOVREL-NEXT: v_mov_b32_e32 v13, s11
-; MOVREL-NEXT: v_mov_b32_e32 v12, s10
-; MOVREL-NEXT: v_mov_b32_e32 v11, s9
-; MOVREL-NEXT: v_mov_b32_e32 v10, s8
-; MOVREL-NEXT: v_mov_b32_e32 v9, s7
-; MOVREL-NEXT: v_mov_b32_e32 v8, s6
-; MOVREL-NEXT: v_mov_b32_e32 v7, s5
-; MOVREL-NEXT: v_mov_b32_e32 v6, s4
-; MOVREL-NEXT: v_mov_b32_e32 v5, s3
-; MOVREL-NEXT: v_mov_b32_e32 v4, s2
-; MOVREL-NEXT: v_mov_b32_e32 v3, s1
-; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
-; MOVREL-NEXT: v_movreld_b32_e32 v3, v1
-; MOVREL-NEXT: v_readfirstlane_b32 s0, v2
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v3
+; MOVREL-NEXT: v_mov_b32_e32 v18, s15
+; MOVREL-NEXT: v_mov_b32_e32 v17, s14
+; MOVREL-NEXT: v_mov_b32_e32 v16, s13
+; MOVREL-NEXT: v_mov_b32_e32 v15, s12
+; MOVREL-NEXT: v_mov_b32_e32 v14, s11
+; MOVREL-NEXT: v_mov_b32_e32 v13, s10
+; MOVREL-NEXT: v_mov_b32_e32 v12, s9
+; MOVREL-NEXT: v_mov_b32_e32 v11, s8
+; MOVREL-NEXT: v_mov_b32_e32 v10, s7
+; MOVREL-NEXT: v_mov_b32_e32 v9, s6
+; MOVREL-NEXT: v_mov_b32_e32 v8, s5
+; MOVREL-NEXT: v_mov_b32_e32 v7, s4
+; MOVREL-NEXT: v_mov_b32_e32 v6, s3
+; MOVREL-NEXT: v_mov_b32_e32 v5, s2
+; MOVREL-NEXT: v_mov_b32_e32 v4, s1
+; MOVREL-NEXT: v_mov_b32_e32 v3, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v6, v1, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3
 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v4
 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v5
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v7, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1
+; MOVREL-NEXT: v_readfirstlane_b32 s0, v2
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v3
 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v6
 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v7
 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
 ; MOVREL-NEXT: v_readfirstlane_b32 s7, v9
-; MOVREL-NEXT: v_readfirstlane_b32 s8, v10
-; MOVREL-NEXT: v_readfirstlane_b32 s9, v11
-; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: v_readfirstlane_b32 s8, v0
+; MOVREL-NEXT: v_readfirstlane_b32 s9, v1
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
@@ -4312,64 +4312,47 @@
 ; GPRIDX-NEXT: s_mov_b32 s7, s9
 ; GPRIDX-NEXT: s_mov_b32 s8, s10
 ; GPRIDX-NEXT: s_mov_b32 s9, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v34, s15
-; GPRIDX-NEXT: v_mov_b32_e32 v33, s14
-; GPRIDX-NEXT: v_mov_b32_e32 v32, s13
-; GPRIDX-NEXT: v_mov_b32_e32 v31, s12
-; GPRIDX-NEXT: v_mov_b32_e32 v30, s11
-; GPRIDX-NEXT: v_mov_b32_e32 v29, s10
-; GPRIDX-NEXT: v_mov_b32_e32 v28, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v27, s8
-; GPRIDX-NEXT: v_mov_b32_e32 v26, s7
-; GPRIDX-NEXT: v_mov_b32_e32 v25, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v24, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v23, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v22, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v21, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v20, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v19, s0
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB56_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
-; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v19
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v20
-; GPRIDX-NEXT: v_mov_b32_e32 v5, v21
-; GPRIDX-NEXT: v_mov_b32_e32 v6, v22
-; GPRIDX-NEXT: v_mov_b32_e32 v7, v23
-; GPRIDX-NEXT: v_mov_b32_e32 v8, v24
-; GPRIDX-NEXT: v_mov_b32_e32 v9, v25
-; GPRIDX-NEXT: v_mov_b32_e32 v10, v26
-; GPRIDX-NEXT: v_mov_b32_e32 v11, v27
-; GPRIDX-NEXT: v_mov_b32_e32 v12, v28
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v29
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v30
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v31
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v32
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v33
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v34
-; GPRIDX-NEXT: v_mov_b32_e32 v3, v0
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v4, v1
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB56_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: v_mov_b32_e32 v18, s15
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s14
+; GPRIDX-NEXT: v_mov_b32_e32 v16, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v15, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v14, s11
+; GPRIDX-NEXT: v_mov_b32_e32 v13, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v12, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v11, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v10, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v9, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v8, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s0
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2
+; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v5, v0, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v9, v0, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v0, v11, v0, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v1, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v10, v1, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5]
 ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v3
 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v4
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v5
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
 ; GPRIDX-NEXT: v_readfirstlane_b32 s3, v6
-; GPRIDX-NEXT: v_readfirstlane_b32 s4, v7
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v5
 ; GPRIDX-NEXT: v_readfirstlane_b32 s5, v8
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v9
-; GPRIDX-NEXT: v_readfirstlane_b32 s7, v10
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v11
-; GPRIDX-NEXT: v_readfirstlane_b32 s9, v12
+; GPRIDX-NEXT: v_readfirstlane_b32 s6, v7
+; GPRIDX-NEXT: v_readfirstlane_b32 s7, v9
+; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0
+; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v5f64_s_v_v:
@@ -4384,61 +4367,48 @@
 ; MOVREL-NEXT: s_mov_b32 s7, s9
 ; MOVREL-NEXT: s_mov_b32 s8, s10
 ; MOVREL-NEXT: s_mov_b32 s9, s11
-; MOVREL-NEXT: v_mov_b32_e32 v34, s15
-; MOVREL-NEXT: v_mov_b32_e32 v33, s14
-; MOVREL-NEXT: v_mov_b32_e32 v32, s13
-; MOVREL-NEXT: v_mov_b32_e32 v31, s12
-; MOVREL-NEXT: v_mov_b32_e32 v30, s11
-; MOVREL-NEXT: v_mov_b32_e32 v29, s10
-; MOVREL-NEXT: v_mov_b32_e32 v28, s9
-; MOVREL-NEXT: v_mov_b32_e32 v27, s8
-; MOVREL-NEXT: v_mov_b32_e32 v26, s7
-; MOVREL-NEXT: v_mov_b32_e32 v25, s6
-; MOVREL-NEXT: v_mov_b32_e32 v24, s5
-; MOVREL-NEXT: v_mov_b32_e32 v23, s4
-; MOVREL-NEXT: v_mov_b32_e32 v22, s3
-; MOVREL-NEXT: v_mov_b32_e32 v21, s2
-; MOVREL-NEXT: v_mov_b32_e32 v20, s1
-; MOVREL-NEXT: v_mov_b32_e32 v19, s0
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_mov_b32_e32 v18, s15
+; MOVREL-NEXT: v_mov_b32_e32 v17, s14
+; MOVREL-NEXT: v_mov_b32_e32 v16, s13
+; MOVREL-NEXT: v_mov_b32_e32 v15, s12
+; MOVREL-NEXT: v_mov_b32_e32 v14, s11
+; MOVREL-NEXT: v_mov_b32_e32 v13, s10
+; MOVREL-NEXT: v_mov_b32_e32 v12, s9
+; MOVREL-NEXT: v_mov_b32_e32 v11, s8
+; MOVREL-NEXT: v_mov_b32_e32 v10, s7
+; MOVREL-NEXT: v_mov_b32_e32 v9, s6
+; MOVREL-NEXT: v_mov_b32_e32 v8, s5
+; MOVREL-NEXT: v_mov_b32_e32 v7, s4
+; MOVREL-NEXT: v_mov_b32_e32 v6, s3
+; MOVREL-NEXT: v_mov_b32_e32 v5, s2
+; MOVREL-NEXT: v_mov_b32_e32 v4, s1
+; MOVREL-NEXT: v_mov_b32_e32 v3, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 4, v2
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB56_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v2
-; MOVREL-NEXT: v_mov_b32_e32 v3, v19
-; MOVREL-NEXT: v_mov_b32_e32 v4, v20
-; MOVREL-NEXT: v_mov_b32_e32 v5, v21
-; MOVREL-NEXT: v_mov_b32_e32 v6, v22
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2
-; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
-; MOVREL-NEXT: v_mov_b32_e32 v7, v23
-; MOVREL-NEXT: v_mov_b32_e32 v8, v24
-; MOVREL-NEXT: v_mov_b32_e32 v9, v25
-; MOVREL-NEXT: v_mov_b32_e32 v10, v26
-; MOVREL-NEXT: v_mov_b32_e32 v11, v27
-; MOVREL-NEXT: v_mov_b32_e32 v12, v28
-; MOVREL-NEXT: v_mov_b32_e32 v13, v29
-; MOVREL-NEXT: v_mov_b32_e32 v14, v30
-; MOVREL-NEXT: v_mov_b32_e32 v15, v31
-; MOVREL-NEXT: v_mov_b32_e32 v16, v32
-; MOVREL-NEXT: v_mov_b32_e32 v17, v33
-; MOVREL-NEXT: v_mov_b32_e32 v18, v34
-; MOVREL-NEXT: v_movreld_b32_e32 v3, v0
-; MOVREL-NEXT: v_movreld_b32_e32 v4, v1
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB56_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_readfirstlane_b32 s0, v3
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v4
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2
+; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 3, v2
 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v5
 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v6
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0
+; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1
+; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1
+; MOVREL-NEXT: v_readfirstlane_b32 s0, v3
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v4
 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v7
-; MOVREL-NEXT: v_readfirstlane_b32 s5, v8
-; MOVREL-NEXT: v_readfirstlane_b32 s6, v9
-; MOVREL-NEXT: v_readfirstlane_b32 s7, v10
-; MOVREL-NEXT: v_readfirstlane_b32 s8, v11
-; MOVREL-NEXT: v_readfirstlane_b32 s9, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s5, v2
+; MOVREL-NEXT: v_readfirstlane_b32 s6, v8
+; MOVREL-NEXT: v_readfirstlane_b32 s7, v9
+; MOVREL-NEXT: v_readfirstlane_b32 s8, v0
+; MOVREL-NEXT: v_readfirstlane_b32 s9, v1
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
@@ -4448,12 +4418,21 @@
 define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_s(<5 x double> %vec, double %val, i32 inreg %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v5f64_v_v_s:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v11
-; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v0, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v1, v16
-; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 1
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], s2, 2
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], s2, 3
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], s2, 4
+; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[8:9]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[6:7]
 ; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0
 ; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1
 ; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
@@ -4468,19 +4447,32 @@
 ;
 ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: v_mov_b32_e32 v16, v11
-; MOVREL-NEXT: s_lshl_b32 m0, s2, 1
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0
+; MOVREL-NEXT: v_mov_b32_e32 v13, v2
+; MOVREL-NEXT: v_mov_b32_e32 v14, v3
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: v_movreld_b32_e32 v0, v10
-; MOVREL-NEXT: v_movreld_b32_e32 v1, v16
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1
 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
-; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2
 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 3
 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v4
 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v5
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 4
+; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v6
 ; MOVREL-NEXT: v_readfirstlane_b32 s7, v7
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
 ; MOVREL-NEXT: v_readfirstlane_b32 s8, v8
 ; MOVREL-NEXT: v_readfirstlane_b32 s9, v9
 ; MOVREL-NEXT: ; return to shader part epilog
@@ -4492,91 +4484,63 @@
 define amdgpu_ps <5 x double> @dyn_insertelement_v5f64_v_v_v(<5 x double> %vec, double %val, i32 %idx) {
 ; GPRIDX-LABEL: dyn_insertelement_v5f64_v_v_v:
 ; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
-; GPRIDX-NEXT: BB58_1: ; =>This Inner Loop Header: Depth=1
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v12
-; GPRIDX-NEXT: s_lshl_b32 s3, s2, 1
-; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v12
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v28, v15
-; GPRIDX-NEXT: v_mov_b32_e32 v27, v14
-; GPRIDX-NEXT: v_mov_b32_e32 v26, v13
-; GPRIDX-NEXT: v_mov_b32_e32 v25, v12
-; GPRIDX-NEXT: v_mov_b32_e32 v24, v11
-; GPRIDX-NEXT: v_mov_b32_e32 v23, v10
-; GPRIDX-NEXT: v_mov_b32_e32 v22, v9
-; GPRIDX-NEXT: v_mov_b32_e32 v21, v8
-; GPRIDX-NEXT: v_mov_b32_e32 v20, v7
-; GPRIDX-NEXT: v_mov_b32_e32 v19, v6
-; GPRIDX-NEXT: v_mov_b32_e32 v18, v5
-; GPRIDX-NEXT: v_mov_b32_e32 v17, v4
-; GPRIDX-NEXT: v_mov_b32_e32 v16, v3
-; GPRIDX-NEXT: v_mov_b32_e32 v15, v2
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v1
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v0
-; GPRIDX-NEXT: v_mov_b32_e32 v13, v10
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
-; GPRIDX-NEXT: v_mov_b32_e32 v14, v11
-; GPRIDX-NEXT: s_set_gpr_idx_off
-; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
-; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
-; GPRIDX-NEXT: s_cbranch_execnz BB58_1
-; GPRIDX-NEXT: ; %bb.2:
-; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
-; GPRIDX-NEXT: v_readfirstlane_b32 s0, v13
-; GPRIDX-NEXT: v_readfirstlane_b32 s1, v14
-; GPRIDX-NEXT: v_readfirstlane_b32 s2, v15
-; GPRIDX-NEXT: v_readfirstlane_b32 s3, v16
-; GPRIDX-NEXT: v_readfirstlane_b32 s4, v17
-; GPRIDX-NEXT: v_readfirstlane_b32 s5, v18
-; GPRIDX-NEXT: v_readfirstlane_b32 s6, v19
-; GPRIDX-NEXT: v_readfirstlane_b32 s7, v20
-; GPRIDX-NEXT: v_readfirstlane_b32 s8, v21
-; GPRIDX-NEXT: v_readfirstlane_b32 s9, v22
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v12
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v12
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 2, v12
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 3, v12
+; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 4, v12
+; GPRIDX-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[6:7]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GPRIDX-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; GPRIDX-NEXT: v_readfirstlane_b32 s0, v0
+; GPRIDX-NEXT: v_readfirstlane_b32 s1, v1
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2
+; GPRIDX-NEXT: v_readfirstlane_b32 s3, v3
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v4
+; GPRIDX-NEXT: v_readfirstlane_b32 s5, v5
+; GPRIDX-NEXT: v_readfirstlane_b32 s6, v6
+; GPRIDX-NEXT: v_readfirstlane_b32 s7, v7
+; GPRIDX-NEXT: v_readfirstlane_b32 s8, v8
+; GPRIDX-NEXT: v_readfirstlane_b32 s9, v9
 ; GPRIDX-NEXT: ; return to shader part epilog
 ;
 ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v:
 ; MOVREL: ; %bb.0: ; %entry
-; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12
+; MOVREL-NEXT: v_mov_b32_e32 v13, v2
+; MOVREL-NEXT: v_mov_b32_e32 v14, v3
 ; MOVREL-NEXT: ; implicit-def: $vcc_hi
-; MOVREL-NEXT: BB58_1: ; =>This Inner Loop Header: Depth=1
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v12
-; MOVREL-NEXT: v_mov_b32_e32 v28, v15
-; MOVREL-NEXT: v_mov_b32_e32 v27, v14
-; MOVREL-NEXT: v_mov_b32_e32 v26, v13
-; MOVREL-NEXT: v_mov_b32_e32 v25, v12
-; MOVREL-NEXT: v_mov_b32_e32 v24, v11
-; MOVREL-NEXT: v_mov_b32_e32 v23, v10
-; MOVREL-NEXT: v_mov_b32_e32 v22, v9
-; MOVREL-NEXT: v_mov_b32_e32 v21, v8
-; MOVREL-NEXT: v_mov_b32_e32 v20, v7
-; MOVREL-NEXT: v_mov_b32_e32 v19, v6
-; MOVREL-NEXT: v_mov_b32_e32 v18, v5
-; MOVREL-NEXT: v_mov_b32_e32 v17, v4
-; MOVREL-NEXT: v_mov_b32_e32 v16, v3
-; MOVREL-NEXT: v_mov_b32_e32 v15, v2
-; MOVREL-NEXT: v_mov_b32_e32 v14, v1
-; MOVREL-NEXT: v_mov_b32_e32 v13, v0
-; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v12
-; MOVREL-NEXT: s_lshl_b32 m0, s1, 1
-; MOVREL-NEXT: v_movreld_b32_e32 v13, v10
-; MOVREL-NEXT: v_movreld_b32_e32 v14, v11
-; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
-; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
-; MOVREL-NEXT: s_cbranch_execnz BB58_1
-; MOVREL-NEXT: ; %bb.2:
-; MOVREL-NEXT: s_mov_b32 exec_lo, s0
-; MOVREL-NEXT: v_readfirstlane_b32 s0, v13
-; MOVREL-NEXT: v_readfirstlane_b32 s1, v14
-; MOVREL-NEXT: v_readfirstlane_b32 s2, v15
-; MOVREL-NEXT: v_readfirstlane_b32 s3, v16
-; MOVREL-NEXT: v_readfirstlane_b32 s4, v17
-; MOVREL-NEXT: v_readfirstlane_b32 s5, v18
-; MOVREL-NEXT: v_readfirstlane_b32 s6, v19
-; MOVREL-NEXT: v_readfirstlane_b32 s7, v20
-; MOVREL-NEXT: v_readfirstlane_b32 s8, v21
-; MOVREL-NEXT: v_readfirstlane_b32 s9, v22
+; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s0, v0
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
+; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s2, v2
+; MOVREL-NEXT: v_readfirstlane_b32 s3, v3
+; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s4, v4
+; MOVREL-NEXT: v_readfirstlane_b32 s5, v5
+; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v12
+; MOVREL-NEXT: v_readfirstlane_b32 s6, v6
+; MOVREL-NEXT: v_readfirstlane_b32 s7, v7
+; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; MOVREL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc_lo
+; MOVREL-NEXT: v_readfirstlane_b32 s8, v8
+; MOVREL-NEXT: v_readfirstlane_b32 s9, v9
 ; MOVREL-NEXT: ; return to shader part epilog
 entry:
   %insert = insertelement <5 x double> %vec, double %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir
@@ -14,8 +14,21 @@
 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr5
- ; CHECK: [[IVEC:%[0-9]+]]:sgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32)
- ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP]](s32), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP1]](s32), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP2]](s32), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:sgpr(s32) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:sgpr(s32) = G_SELECT [[ICMP3]](s32), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32)
+ ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = COPY $sgpr4
 %2:_(s32) = COPY $sgpr5
@@ -33,10 +46,24 @@
 ; CHECK-LABEL: name: insert_vector_elt_v4i32_v_s_s
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $sgpr0
 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32)
- ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32)
+ ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 %1:_(s32) = COPY $sgpr0
 %2:_(s32) = COPY $sgpr1
@@ -57,8 +84,22 @@
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>)
- ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32)
- ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY4]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY4]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY4]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY4]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32)
+ ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = COPY $vgpr0
 %2:_(s32) = COPY $sgpr4
@@ -76,30 +117,26 @@
 liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_s_v
- ; CHECK: successors: %bb.1(0x80000000)
 ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0
 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $sgpr4
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>)
- ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
- ; CHECK: .1:
- ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1
- ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
- ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
- ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
- ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
- ; CHECK: .2:
- ; CHECK: successors: %bb.3(0x80000000)
- ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
- ; CHECK: .3:
- ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32)
+ ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = COPY $sgpr4
 %2:_(s32) = COPY $vgpr0
@@ -117,30 +154,26 @@
 liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1
 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_v_v
- ; CHECK: successors: %bb.1(0x80000000)
 ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1
 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>)
- ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
- ; CHECK: .1:
- ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %10, %bb.1
- ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
- ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
- ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
- ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
- ; CHECK: .2:
- ; CHECK: successors: %bb.3(0x80000000)
- ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
- ; CHECK: .3:
- ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32)
+ ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
 %1:_(s32) = COPY $vgpr0
 %2:_(s32) = COPY $vgpr1
@@ -158,29 +191,25 @@
 liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0
 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_s_v
- ; CHECK: successors: %bb.1(0x80000000)
 ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
- ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4
- ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
- ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF
- ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
- ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec
- ; CHECK: .1:
- ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000)
- ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1
- ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1
- ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec
- ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec
- ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32)
- ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec
- ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
- ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec
- ; CHECK: .2:
- ; CHECK: successors: %bb.3(0x80000000)
- ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]]
- ; CHECK: .3:
- ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $sgpr4
+ ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32)
+ ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
 %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 %1:_(s32) = COPY $sgpr4
 %2:_(s32) = COPY $vgpr0
@@ -202,8 +231,22 @@
 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32)
- ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>)
+ ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY2]](s32)
+ ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+ ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
+ ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C]]
+ ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]]
+ ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
+ ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+ ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]]
+ ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2
+ ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C2]]
+ ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]]
+ ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3
+ ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C3]]
+ ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]]
+ ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), 
[[SELECT2]](s32), [[SELECT3]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr0 %2:_(s32) = COPY $sgpr0 @@ -221,29 +264,25 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_v - ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF1]], %bb.0, %9, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32), [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[COPY1]], [[UV]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]] + ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[COPY1]], [[UV1]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]] + ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[COPY1]], [[UV2]] + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]] + ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[COPY1]], [[UV3]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<4 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 %2:_(s32) = COPY $vgpr5 @@ -345,43 +384,48 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_v - ; CHECK: successors: 
%bb.1(0x80000000) ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<8 x s64>) = COPY [[COPY]](<8 x s64>) - ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY3]](<8 x s64>) - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %24, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %9(s32), %bb.1 - ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %10(s32), %bb.1 - ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 - ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %12(<16 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec - ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) - ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32), [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = 
G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]] + ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]] + ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV]], [[UV4]] + ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV1]], [[UV5]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]] + ; CHECK: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV]], [[UV6]] + ; CHECK: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV1]], [[UV7]] + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]] + ; CHECK: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV]], [[UV8]] + ; CHECK: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV1]], [[UV9]] + ; CHECK: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C4]] + ; CHECK: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV]], [[UV10]] + ; CHECK: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV1]], [[UV11]] + ; CHECK: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C5]] + ; CHECK: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV]], [[UV12]] + ; CHECK: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV1]], [[UV13]] + ; CHECK: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C6]] + ; CHECK: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV]], [[UV14]] + ; CHECK: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV1]], [[UV15]] + ; CHECK: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C7]] + ; CHECK: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV]], [[UV16]] + ; CHECK: [[SELECT15:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV1]], [[UV17]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<16 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32), [[SELECT4]](s32), [[SELECT5]](s32), [[SELECT6]](s32), [[SELECT7]](s32), [[SELECT8]](s32), [[SELECT9]](s32), [[SELECT10]](s32), [[SELECT11]](s32), [[SELECT12]](s32), [[SELECT13]](s32), [[SELECT14]](s32), [[SELECT15]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[BUILD_VECTOR]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s64) = COPY $sgpr16_sgpr17 %2:_(s32) = COPY $vgpr0 @@ -399,43 +443,48 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_v - ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, 
$vgpr0_vgpr1, $vgpr2 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<8 x s64>) = COPY [[COPY]](<8 x s64>) ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY3]](<8 x s64>) - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %24, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %9(s32), %bb.1 - ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %10(s32), %bb.1 - ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 - ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %12(<16 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec - ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) - ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32), [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY3]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]] + ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), 
[[UV1]], [[UV3]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]] + ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV]], [[UV4]] + ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV1]], [[UV5]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]] + ; CHECK: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV]], [[UV6]] + ; CHECK: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV1]], [[UV7]] + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]] + ; CHECK: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV]], [[UV8]] + ; CHECK: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV1]], [[UV9]] + ; CHECK: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C4]] + ; CHECK: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV]], [[UV10]] + ; CHECK: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV1]], [[UV11]] + ; CHECK: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C5]] + ; CHECK: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV]], [[UV12]] + ; CHECK: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV1]], [[UV13]] + ; CHECK: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C6]] + ; CHECK: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV]], [[UV14]] + ; CHECK: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV1]], [[UV15]] + ; CHECK: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C7]] + ; CHECK: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV]], [[UV16]] + ; CHECK: [[SELECT15:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV1]], [[UV17]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<16 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32), [[SELECT4]](s32), [[SELECT5]](s32), [[SELECT6]](s32), [[SELECT7]](s32), [[SELECT8]](s32), [[SELECT9]](s32), [[SELECT10]](s32), [[SELECT11]](s32), [[SELECT12]](s32), [[SELECT13]](s32), [[SELECT14]](s32), [[SELECT15]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[BUILD_VECTOR]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 %1:_(s64) = COPY $vgpr0_vgpr1 %2:_(s32) = COPY $vgpr2 @@ -483,42 +532,47 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_v - ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 - ; CHECK: 
[[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 - ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %23, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 - ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 - ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 - ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec - ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) - ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr16 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32), [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]] + ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]] + ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT 
[[ICMP1]](s1), [[UV]], [[UV4]] + ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV1]], [[UV5]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]] + ; CHECK: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV]], [[UV6]] + ; CHECK: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV1]], [[UV7]] + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]] + ; CHECK: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV]], [[UV8]] + ; CHECK: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV1]], [[UV9]] + ; CHECK: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C4]] + ; CHECK: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV]], [[UV10]] + ; CHECK: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV1]], [[UV11]] + ; CHECK: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C5]] + ; CHECK: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV]], [[UV12]] + ; CHECK: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV1]], [[UV13]] + ; CHECK: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C6]] + ; CHECK: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV]], [[UV14]] + ; CHECK: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV1]], [[UV15]] + ; CHECK: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C7]] + ; CHECK: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV]], [[UV16]] + ; CHECK: [[SELECT15:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV1]], [[UV17]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<16 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32), [[SELECT4]](s32), [[SELECT5]](s32), [[SELECT6]](s32), [[SELECT7]](s32), [[SELECT8]](s32), [[SELECT9]](s32), [[SELECT10]](s32), [[SELECT11]](s32), [[SELECT12]](s32), [[SELECT13]](s32), [[SELECT14]](s32), [[SELECT15]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[BUILD_VECTOR]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s64) = COPY $sgpr0_sgpr1 %2:_(s32) = COPY $vgpr16 @@ -536,42 +590,47 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_v - ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr18 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr18 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK: 
[[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: .1: - ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %23, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 - ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 - ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 - ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec - ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) - ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec - ; CHECK: .2: - ; CHECK: successors: %bb.3(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: .3: - ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32), [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]] + ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]] + ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV]], [[UV4]] + ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV1]], [[UV5]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]] + ; CHECK: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT 
[[ICMP2]](s1), [[UV]], [[UV6]] + ; CHECK: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV1]], [[UV7]] + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]] + ; CHECK: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV]], [[UV8]] + ; CHECK: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV1]], [[UV9]] + ; CHECK: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 4 + ; CHECK: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C4]] + ; CHECK: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV]], [[UV10]] + ; CHECK: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV1]], [[UV11]] + ; CHECK: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C5]] + ; CHECK: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV]], [[UV12]] + ; CHECK: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV1]], [[UV13]] + ; CHECK: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C6]] + ; CHECK: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV]], [[UV14]] + ; CHECK: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV1]], [[UV15]] + ; CHECK: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C7]] + ; CHECK: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV]], [[UV16]] + ; CHECK: [[SELECT15:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV1]], [[UV17]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<16 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32), [[SELECT4]](s32), [[SELECT5]](s32), [[SELECT6]](s32), [[SELECT7]](s32), [[SELECT8]](s32), [[SELECT9]](s32), [[SELECT10]](s32), [[SELECT11]](s32), [[SELECT12]](s32), [[SELECT13]](s32), [[SELECT14]](s32), [[SELECT15]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[BUILD_VECTOR]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s64) = COPY $vgpr16_vgpr17 %2:_(s32) = COPY $vgpr18 @@ -589,44 +648,49 @@ body: | ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_v_last_in_block ; CHECK: bb.0: - ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr18 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr18 ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) - ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) - ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 - ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF - ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF - ; 
CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF - ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec - ; CHECK: bb.2: - ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF4]], %bb.0, %23, %bb.2 - ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.2 - ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.2 - ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.2 - ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.2 - ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec - ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec - ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) - ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) - ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) - ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec - ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc - ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec - ; CHECK: bb.3: - ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] - ; CHECK: bb.4: - ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[UV2:%[0-9]+]]:vgpr(s32), [[UV3:%[0-9]+]]:vgpr(s32), [[UV4:%[0-9]+]]:vgpr(s32), [[UV5:%[0-9]+]]:vgpr(s32), [[UV6:%[0-9]+]]:vgpr(s32), [[UV7:%[0-9]+]]:vgpr(s32), [[UV8:%[0-9]+]]:vgpr(s32), [[UV9:%[0-9]+]]:vgpr(s32), [[UV10:%[0-9]+]]:vgpr(s32), [[UV11:%[0-9]+]]:vgpr(s32), [[UV12:%[0-9]+]]:vgpr(s32), [[UV13:%[0-9]+]]:vgpr(s32), [[UV14:%[0-9]+]]:vgpr(s32), [[UV15:%[0-9]+]]:vgpr(s32), [[UV16:%[0-9]+]]:vgpr(s32), [[UV17:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C]] + ; CHECK: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV]], [[UV2]] + ; CHECK: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP]](s1), [[UV1]], [[UV3]] + ; CHECK: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP1:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C1]] + ; CHECK: [[SELECT2:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV]], [[UV4]] + ; CHECK: [[SELECT3:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP1]](s1), [[UV1]], [[UV5]] + ; CHECK: [[C2:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 2 + ; CHECK: [[ICMP2:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C2]] + ; CHECK: [[SELECT4:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV]], [[UV6]] + ; CHECK: [[SELECT5:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP2]](s1), [[UV1]], [[UV7]] + ; CHECK: [[C3:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK: [[ICMP3:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C3]] + ; CHECK: [[SELECT6:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV]], [[UV8]] + ; CHECK: [[SELECT7:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP3]](s1), [[UV1]], [[UV9]] + ; CHECK: [[C4:%[0-9]+]]:sgpr(s32) = G_CONSTANT 
i32 4 + ; CHECK: [[ICMP4:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C4]] + ; CHECK: [[SELECT8:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV]], [[UV10]] + ; CHECK: [[SELECT9:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP4]](s1), [[UV1]], [[UV11]] + ; CHECK: [[C5:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK: [[ICMP5:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C5]] + ; CHECK: [[SELECT10:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV]], [[UV12]] + ; CHECK: [[SELECT11:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP5]](s1), [[UV1]], [[UV13]] + ; CHECK: [[C6:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 6 + ; CHECK: [[ICMP6:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C6]] + ; CHECK: [[SELECT12:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV]], [[UV14]] + ; CHECK: [[SELECT13:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP6]](s1), [[UV1]], [[UV15]] + ; CHECK: [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 7 + ; CHECK: [[ICMP7:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY2]](s32), [[C7]] + ; CHECK: [[SELECT14:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV]], [[UV16]] + ; CHECK: [[SELECT15:%[0-9]+]]:vgpr(s32) = G_SELECT [[ICMP7]](s1), [[UV1]], [[UV17]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:vgpr(<16 x s32>) = G_BUILD_VECTOR [[SELECT]](s32), [[SELECT1]](s32), [[SELECT2]](s32), [[SELECT3]](s32), [[SELECT4]](s32), [[SELECT5]](s32), [[SELECT6]](s32), [[SELECT7]](s32), [[SELECT8]](s32), [[SELECT9]](s32), [[SELECT10]](s32), [[SELECT11]](s32), [[SELECT12]](s32), [[SELECT13]](s32), [[SELECT14]](s32), [[SELECT15]](s32) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[BUILD_VECTOR]](<16 x s32>) ; CHECK: bb.1: - ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18