Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -112,6 +112,7 @@ bool selectG_FRAME_INDEX(MachineInstr &I) const; bool selectG_PTR_MASK(MachineInstr &I) const; bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const; + bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; std::pair<Register, unsigned> selectVOP3ModsImpl(Register Src) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1714,6 +1714,75 @@ return true; } +// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd +bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT( + MachineInstr &MI) const { + Register DstReg = MI.getOperand(0).getReg(); + Register VecReg = MI.getOperand(1).getReg(); + Register ValReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + + LLT VecTy = MRI->getType(DstReg); + LLT ValTy = MRI->getType(ValReg); + unsigned VecSize = VecTy.getSizeInBits(); + unsigned ValSize = ValTy.getSizeInBits(); + + const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI); + const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI); + const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI); + + assert(VecTy.getElementType() == ValTy); + + // The index must be scalar. If it wasn't, RegBankSelect should have moved this + // into a waterfall loop. + if (IdxRB->getID() != AMDGPU::SGPRRegBankID) + return false; + + const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB, + *MRI); + const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB, + *MRI); + + if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) || + !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) || + !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) || + !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI)) + return false; + + if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32) + return false; + + const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID && + STI.useVGPRIndexMode(); + + MachineBasicBlock *BB = MI.getParent(); + const DebugLoc &DL = MI.getDebugLoc(); + unsigned SubReg = ValSize == 64 ? 
AMDGPU::sub0_sub1 : AMDGPU::sub0; + + if (IndexMode) { + BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON)) + .addReg(IdxReg) + .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE); + } else { + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0) + .addReg(IdxReg); + } + + const MCInstrDesc &RegWriteOp + = TII.getIndirectRegWritePseudo(VecSize, ValSize, + VecRB->getID() == AMDGPU::SGPRRegBankID); + BuildMI(*BB, MI, DL, RegWriteOp, DstReg) + .addReg(VecReg) + .addReg(ValReg) + .addImm(SubReg); + + if (IndexMode) + BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF)); + + MI.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::select(MachineInstr &I) { if (I.isPHI()) return selectPHI(I); @@ -1800,6 +1869,8 @@ return selectG_PTR_MASK(I); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return selectG_EXTRACT_VECTOR_ELT(I); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return selectG_INSERT_VECTOR_ELT(I); default: return selectImpl(I, *CoverageInfo); } Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3462,30 +3462,6 @@ return LoopBB; } -static unsigned getIndirectRegWritePseudo(const SIRegisterInfo &TRI, - const TargetRegisterClass *VecRC) { - switch (TRI.getRegSizeInBits(*VecRC)) { - case 32: // 4 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1; - case 64: // 8 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2; - case 96: // 12 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3; - case 128: // 16 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4; - case 160: // 20 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5; - case 256: // 32 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8; - case 512: // 64 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16; - case 1024: // 128 bytes - return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32; - default: - llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); - } -} - static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST) { @@ -3525,12 +3501,12 @@ return &MBB; } + const MCInstrDesc &MovRelDesc + = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false); + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); - - const MCInstrDesc &MovRelDesc - = TII->get(getIndirectRegWritePseudo(TRI, VecRC)); BuildMI(MBB, I, DL, MovRelDesc, Dst) .addReg(SrcVec->getReg()) .add(*Val) @@ -3553,7 +3529,6 @@ Offset, UseGPRIdxMode, false); MachineBasicBlock *LoopBB = InsPt->getParent(); - const MCInstrDesc &MovRelDesc = TII->get(getIndirectRegWritePseudo(TRI, VecRC)); BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) .addReg(PhiReg) .add(*Val) Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -242,6 +242,9 @@ // DstRC, then AMDGPU::COPY is returned. 
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + const MCInstrDesc &getIndirectRegWritePseudo( + unsigned VecSize, unsigned EltSize, bool IsSGPR) const; + LLVM_READONLY int commuteOpcode(unsigned Opc) const; Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -982,6 +982,86 @@ return AMDGPU::COPY; } +static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) { + switch (VecSize) { + case 32: // 4 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1; + case 64: // 8 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2; + case 96: // 12 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3; + case 128: // 16 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4; + case 160: // 20 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5; + case 256: // 32 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8; + case 512: // 64 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16; + case 1024: // 128 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32; + default: + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); + } +} + +static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) { + switch (VecSize) { + case 32: // 4 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1; + case 64: // 8 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2; + case 96: // 12 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3; + case 128: // 16 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4; + case 160: // 20 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5; + case 256: // 32 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8; + case 512: // 64 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16; + case 1024: // 128 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32; + default: + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); + } +} + +static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) { + switch (VecSize) { + case 64: // 8 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1; + case 128: // 16 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2; + case 256: // 32 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4; + case 512: // 64 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8; + case 1024: // 128 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16; + default: + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); + } +} + +const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo( + unsigned VecSize, unsigned EltSize, bool IsSGPR) const { + if (IsSGPR) { + switch (EltSize) { + case 32: + return get(getIndirectSGPRWritePseudo32(VecSize)); + case 64: + return get(getIndirectSGPRWritePseudo64(VecSize)); + default: + llvm_unreachable("invalid reg indexing elt size"); + } + } + + assert(EltSize == 32 && "invalid reg indexing elt size"); + return get(getIndirectVGPRWritePseudoOpc(VecSize)); +} + static unsigned getSGPRSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -1487,9 +1567,31 @@ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5: case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8: case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16: - case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: { - unsigned Opc = ST.useVGPRIndexMode() ? 
- AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32; + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: { + const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); + + unsigned Opc; + if (RI.hasVGPRs(EltRC)) { + Opc = ST.useVGPRIndexMode() ? + AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32; + } else { + Opc = RI.getRegSizeInBits(*EltRC) == 64 ? + AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32; + } + const MCInstrDesc &OpDesc = get(Opc); Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -0,0 +1,2221 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GPRIDX %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=MOVREL %s +; RUN: not llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s + +; FIXME: Need constant bus fixup pre-gfx10 for movrel +; ERR: Bad machine code: VOP* instruction violates constant bus restriction + +define amdgpu_ps <8 x i32> @dyn_insertelement_v8i32_s_s_s(<8 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8i32_s_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 m0, s11 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8i32_s_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 m0, s11 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x i32> %vec, i32 %val, i32 %idx + ret <8 x i32> %insert +} + +define amdgpu_ps <8 x i8 addrspace(3)*> @dyn_insertelement_v8p3i8_s_s_s(<8 x i8 addrspace(3)*> inreg %vec, i8 addrspace(3)* inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8p3i8_s_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; 
GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 m0, s11 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8p3i8_s_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 m0, s11 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 %idx + ret <8 x i8 addrspace(3)*> %insert +} + +define <8 x float> @dyn_insertelement_v8f32_const_s_v_v(float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_const_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b32 s11, 0x41000000 +; GPRIDX-NEXT: s_mov_b32 s4, 1.0 +; GPRIDX-NEXT: s_mov_b32 s5, 2.0 +; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000 +; GPRIDX-NEXT: s_mov_b32 s7, 4.0 +; GPRIDX-NEXT: s_mov_b32 s8, 0x40a00000 +; GPRIDX-NEXT: s_mov_b32 s9, 0x40c00000 +; GPRIDX-NEXT: s_mov_b32 s10, 0x40e00000 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s4 +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v1 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 +; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v2, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v8, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB2_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v9 +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_insertelement_v8f32_const_s_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 +; MOVREL-NEXT: s_mov_b32 s11, 0x41000000 +; MOVREL-NEXT: s_mov_b32 s4, 1.0 +; MOVREL-NEXT: s_mov_b32 s5, 2.0 +; MOVREL-NEXT: s_mov_b32 s6, 0x40400000 +; MOVREL-NEXT: s_mov_b32 s7, 4.0 +; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000 +; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000 +; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000 +; MOVREL-NEXT: v_mov_b32_e32 v17, s11 +; MOVREL-NEXT: v_mov_b32_e32 v13, s7 +; MOVREL-NEXT: v_mov_b32_e32 v14, s8 +; MOVREL-NEXT: v_mov_b32_e32 v15, s9 +; MOVREL-NEXT: v_mov_b32_e32 v16, s10 +; MOVREL-NEXT: 
v_mov_b32_e32 v12, s6 +; MOVREL-NEXT: v_mov_b32_e32 v11, s5 +; MOVREL-NEXT: v_mov_b32_e32 v10, s4 +; MOVREL-NEXT: s_mov_b32 s4, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s5, v1 +; MOVREL-NEXT: v_mov_b32_e32 v2, v10 +; MOVREL-NEXT: v_mov_b32_e32 v3, v11 +; MOVREL-NEXT: v_mov_b32_e32 v4, v12 +; MOVREL-NEXT: v_mov_b32_e32 v5, v13 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v1 +; MOVREL-NEXT: s_mov_b32 m0, s5 +; MOVREL-NEXT: v_mov_b32_e32 v6, v14 +; MOVREL-NEXT: v_mov_b32_e32 v7, v15 +; MOVREL-NEXT: v_mov_b32_e32 v8, v16 +; MOVREL-NEXT: v_mov_b32_e32 v9, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB2_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s4 +; MOVREL-NEXT: v_mov_b32_e32 v0, v2 +; MOVREL-NEXT: v_mov_b32_e32 v1, v3 +; MOVREL-NEXT: v_mov_b32_e32 v2, v4 +; MOVREL-NEXT: v_mov_b32_e32 v3, v5 +; MOVREL-NEXT: v_mov_b32_e32 v4, v6 +; MOVREL-NEXT: v_mov_b32_e32 v5, v7 +; MOVREL-NEXT: v_mov_b32_e32 v6, v8 +; MOVREL-NEXT: v_mov_b32_e32 v7, v9 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %insert = insertelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %vec, float inreg %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s0 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v8 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v8 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s10 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB3_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v16, s7 +; MOVREL-NEXT: v_mov_b32_e32 v8, v0 +; MOVREL-NEXT: v_mov_b32_e32 v14, s5 +; MOVREL-NEXT: v_mov_b32_e32 v13, s4 +; MOVREL-NEXT: v_mov_b32_e32 v15, s6 +; MOVREL-NEXT: v_mov_b32_e32 v12, s3
+; MOVREL-NEXT: v_mov_b32_e32 v11, s2 +; MOVREL-NEXT: v_mov_b32_e32 v10, s1 +; MOVREL-NEXT: v_mov_b32_e32 v9, s0 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v8 +; MOVREL-NEXT: v_mov_b32_e32 v0, v9 +; MOVREL-NEXT: v_mov_b32_e32 v1, v10 +; MOVREL-NEXT: v_mov_b32_e32 v2, v11 +; MOVREL-NEXT: v_mov_b32_e32 v3, v12 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_mov_b32_e32 v4, v13 +; MOVREL-NEXT: v_mov_b32_e32 v5, v14 +; MOVREL-NEXT: v_mov_b32_e32 v6, v15 +; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: v_movreld_b32_e32 v0, s10 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB3_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v8, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: s_set_gpr_idx_on s10, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_s_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: v_mov_b32_e32 v8, v0 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; MOVREL-NEXT: s_mov_b32 m0, s10 +; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; MOVREL-NEXT: v_mov_b32_e32 v6, s6 +; MOVREL-NEXT: v_mov_b32_e32 v7, s7 +; MOVREL-NEXT: v_movreld_b32_e32 v0, v8 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_s(<8 x float> %vec, float inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_v_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_v_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s3 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_movreld_b32_e32 v0, s2 +; MOVREL-NEXT: ; return 
to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s0 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB6_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_s_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v17, s7 +; MOVREL-NEXT: v_mov_b32_e32 v8, v0 +; MOVREL-NEXT: v_mov_b32_e32 v9, v1 +; MOVREL-NEXT: v_mov_b32_e32 v15, s5 +; MOVREL-NEXT: v_mov_b32_e32 v16, s6 +; MOVREL-NEXT: v_mov_b32_e32 v14, s4 +; MOVREL-NEXT: v_mov_b32_e32 v13, s3 +; MOVREL-NEXT: v_mov_b32_e32 v12, s2 +; MOVREL-NEXT: v_mov_b32_e32 v11, s1 +; MOVREL-NEXT: v_mov_b32_e32 v10, s0 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v0, v10 +; MOVREL-NEXT: v_mov_b32_e32 v1, v11 +; MOVREL-NEXT: v_mov_b32_e32 v2, v12 +; MOVREL-NEXT: v_mov_b32_e32 v3, v13 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_mov_b32_e32 v4, v14 +; MOVREL-NEXT: v_mov_b32_e32 v5, v15 +; MOVREL-NEXT: v_mov_b32_e32 v6, v16 +; MOVREL-NEXT: v_mov_b32_e32 v7, v17 +; MOVREL-NEXT: v_movreld_b32_e32 v0, v8 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB6_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_s_v(<8 x 
float> %vec, float inreg %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_v_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s3, v8 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s3, v8 +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s2 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB7_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_v_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v8 +; MOVREL-NEXT: v_mov_b32_e32 v16, v7 +; MOVREL-NEXT: v_mov_b32_e32 v9, v0 +; MOVREL-NEXT: v_mov_b32_e32 v15, v6 +; MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_mov_b32_e32 v13, v4 +; MOVREL-NEXT: v_mov_b32_e32 v12, v3 +; MOVREL-NEXT: v_mov_b32_e32 v11, v2 +; MOVREL-NEXT: v_mov_b32_e32 v10, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v9, s2 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB7_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, v9 +; MOVREL-NEXT: v_mov_b32_e32 v1, v10 +; MOVREL-NEXT: v_mov_b32_e32 v2, v11 +; MOVREL-NEXT: v_mov_b32_e32 v3, v12 +; MOVREL-NEXT: v_mov_b32_e32 v4, v13 +; MOVREL-NEXT: v_mov_b32_e32 v5, v14 +; MOVREL-NEXT: v_mov_b32_e32 v6, v15 +; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_s(<8 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_movreld_b32_e32 v0, v8 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8p3i8_v_v_s(<8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8p3i8_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on 
s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8p3i8_v_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_movreld_b32_e32 v0, v8 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x i8 addrspace(3)*> %vec, i8 addrspace(3)* %val, i32 %idx + %cast.0 = ptrtoint <8 x i8 addrspace(3)*> %insert to <8 x i32> + %cast.1 = bitcast <8 x i32> %cast.0 to <8 x float> + ret <8 x float> %cast.1 +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v(<8 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB10_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v9 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v9 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v17, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v16, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB10_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v17 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB10_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v9 +; MOVREL-NEXT: v_mov_b32_e32 v17, v7 +; MOVREL-NEXT: v_mov_b32_e32 v10, v0 +; MOVREL-NEXT: v_mov_b32_e32 v16, v6 +; MOVREL-NEXT: v_mov_b32_e32 v15, v5 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_mov_b32_e32 v14, v4 +; MOVREL-NEXT: v_mov_b32_e32 v13, v3 +; MOVREL-NEXT: v_mov_b32_e32 v12, v2 +; MOVREL-NEXT: v_mov_b32_e32 v11, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v10, v8 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB10_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, v10 +; MOVREL-NEXT: v_mov_b32_e32 v1, v11 +; MOVREL-NEXT: v_mov_b32_e32 v2, v12 +; MOVREL-NEXT: v_mov_b32_e32 v3, v13 +; MOVREL-NEXT: v_mov_b32_e32 v4, v14 +; MOVREL-NEXT: v_mov_b32_e32 v5, v15 +; MOVREL-NEXT: v_mov_b32_e32 v6, v16 +; MOVREL-NEXT: v_mov_b32_e32 v7, v17 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x float> %vec, float %val, i32 %idx + ret <8 x float> %insert +} + +define amdgpu_ps <8 x i64> @dyn_insertelement_v8i64_s_s_s(<8 x i64> inreg %vec, i64 inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8i64_s_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 
s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 m0, s20 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[18:19] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8i64_s_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 m0, s20 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_movreld_b64 s[0:1], s[18:19] +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x i64> %vec, i64 %val, i32 %idx + ret <8 x i64> %insert +} + +define amdgpu_ps <8 x i8 addrspace(1)*> @dyn_insertelement_v8p1i8_s_s_s(<8 x i8 addrspace(1)*> inreg %vec, i8 addrspace(1)* inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8p1i8_s_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 m0, s20 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[18:19] +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8p1i8_s_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 m0, s20 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_movreld_b64 s[0:1], s[18:19] +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <8 x i8 addrspace(1)*> %vec, i8 addrspace(1)* %val, i32 %idx + ret <8 x i8 addrspace(1)*> %insert +} + +define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_const_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GPRIDX-NEXT: s_mov_b32 s8, 0 +; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 +; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 +; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 +; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0 +; GPRIDX-NEXT: s_mov_b32 s13, 0x40140000 +; GPRIDX-NEXT: s_mov_b32 s12, s8 +; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 +; GPRIDX-NEXT: s_mov_b32 s14, s8 +; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 +; GPRIDX-NEXT: s_mov_b32 s16, s8 +; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 +; GPRIDX-NEXT: s_mov_b32 s18, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 +; GPRIDX-NEXT: s_mov_b64 s[4:5], exec +; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill +; GPRIDX-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s6, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s6, v2 +; GPRIDX-NEXT: s_lshl_b32 s6, s6, 1 +; GPRIDX-NEXT: s_add_u32 s7, s6, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s6, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v34, v18 +; GPRIDX-NEXT: v_mov_b32_e32 v33, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v32, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v31, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v30, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v29, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v28, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v27, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v26, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v25, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v24, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v23, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v22, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v21, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_set_gpr_idx_on s7, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v19, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB13_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[4:5] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; GPRIDX-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GPRIDX-NEXT: s_waitcnt vmcnt(0) +; GPRIDX-NEXT: s_setpc_b64 s[30:31] +; +; MOVREL-LABEL: dyn_insertelement_v8f64_const_s_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 +; MOVREL-NEXT: s_mov_b32 s8, 0 +; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 +; MOVREL-NEXT: s_mov_b64 
s[6:7], 2.0 +; MOVREL-NEXT: s_mov_b32 s9, 0x40080000 +; MOVREL-NEXT: s_mov_b64 s[10:11], 4.0 +; MOVREL-NEXT: s_mov_b32 s13, 0x40140000 +; MOVREL-NEXT: s_mov_b32 s12, s8 +; MOVREL-NEXT: s_mov_b32 s15, 0x40180000 +; MOVREL-NEXT: s_mov_b32 s14, s8 +; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000 +; MOVREL-NEXT: s_mov_b32 s16, s8 +; MOVREL-NEXT: s_mov_b32 s19, 0x40200000 +; MOVREL-NEXT: s_mov_b32 s18, s8 +; MOVREL-NEXT: v_mov_b32_e32 v3, s4 +; MOVREL-NEXT: v_mov_b32_e32 v4, s5 +; MOVREL-NEXT: v_mov_b32_e32 v5, s6 +; MOVREL-NEXT: v_mov_b32_e32 v6, s7 +; MOVREL-NEXT: v_mov_b32_e32 v7, s8 +; MOVREL-NEXT: v_mov_b32_e32 v8, s9 +; MOVREL-NEXT: v_mov_b32_e32 v9, s10 +; MOVREL-NEXT: v_mov_b32_e32 v10, s11 +; MOVREL-NEXT: v_mov_b32_e32 v11, s12 +; MOVREL-NEXT: v_mov_b32_e32 v12, s13 +; MOVREL-NEXT: v_mov_b32_e32 v13, s14 +; MOVREL-NEXT: v_mov_b32_e32 v14, s15 +; MOVREL-NEXT: v_mov_b32_e32 v15, s16 +; MOVREL-NEXT: v_mov_b32_e32 v16, s17 +; MOVREL-NEXT: v_mov_b32_e32 v17, s18 +; MOVREL-NEXT: v_mov_b32_e32 v18, s19 +; MOVREL-NEXT: s_mov_b32 s4, exec_lo +; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB13_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s5, v2 +; MOVREL-NEXT: v_mov_b32_e32 v34, v18 +; MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; MOVREL-NEXT: v_mov_b32_e32 v33, v17 +; MOVREL-NEXT: v_mov_b32_e32 v32, v16 +; MOVREL-NEXT: s_lshl_b32 s6, s5, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v2 +; MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; MOVREL-NEXT: s_add_u32 s5, s6, 1 +; MOVREL-NEXT: s_mov_b32 m0, s6 +; MOVREL-NEXT: v_mov_b32_e32 v28, v12 +; MOVREL-NEXT: v_mov_b32_e32 v27, v11 +; MOVREL-NEXT: v_mov_b32_e32 v26, v10 +; MOVREL-NEXT: v_mov_b32_e32 v25, v9 +; MOVREL-NEXT: v_mov_b32_e32 v24, v8 +; MOVREL-NEXT: v_mov_b32_e32 v23, v7 +; MOVREL-NEXT: v_mov_b32_e32 v22, v6 +; MOVREL-NEXT: v_mov_b32_e32 v21, v5 +; MOVREL-NEXT: v_mov_b32_e32 v20, v4 +; MOVREL-NEXT: v_movreld_b32_e32 v19, v0 +; MOVREL-NEXT: s_mov_b32 m0, s5 +; MOVREL-NEXT: v_movreld_b32_e32 v19, v1 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB13_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s4 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload +; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; MOVREL-NEXT: s_waitcnt vmcnt(0) +; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 +; MOVREL-NEXT: s_setpc_b64 s[30:31] +entry: + %insert = insertelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, double %val, i32 %idx + %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1> + %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5> + %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7> + store volatile <2 x double> 
%vec.0, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef + ret void +} + +define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s0 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v0 +; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 +; GPRIDX-NEXT: s_add_u32 s3, s2, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v32, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v31, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v30, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v29, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v28, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v27, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v26, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v25, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v24, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v23, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v22, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v21, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v18, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v17, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v17, s19 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB14_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[17:20], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[21:24], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[25:28], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[29:32], off +; GPRIDX-NEXT: s_endpgm +; +; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_mov_b32 
s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: v_mov_b32_e32 v16, s15 +; MOVREL-NEXT: v_mov_b32_e32 v14, s13 +; MOVREL-NEXT: v_mov_b32_e32 v12, s11 +; MOVREL-NEXT: v_mov_b32_e32 v13, s12 +; MOVREL-NEXT: v_mov_b32_e32 v15, s14 +; MOVREL-NEXT: v_mov_b32_e32 v11, s10 +; MOVREL-NEXT: v_mov_b32_e32 v10, s9 +; MOVREL-NEXT: v_mov_b32_e32 v9, s8 +; MOVREL-NEXT: v_mov_b32_e32 v8, s7 +; MOVREL-NEXT: v_mov_b32_e32 v7, s6 +; MOVREL-NEXT: v_mov_b32_e32 v6, s5 +; MOVREL-NEXT: v_mov_b32_e32 v5, s4 +; MOVREL-NEXT: v_mov_b32_e32 v4, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s2 +; MOVREL-NEXT: v_mov_b32_e32 v2, s1 +; MOVREL-NEXT: v_mov_b32_e32 v1, s0 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB14_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v0 +; MOVREL-NEXT: v_mov_b32_e32 v32, v16 +; MOVREL-NEXT: v_mov_b32_e32 v17, v1 +; MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0 +; MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; MOVREL-NEXT: v_mov_b32_e32 v28, v12 +; MOVREL-NEXT: v_mov_b32_e32 v27, v11 +; MOVREL-NEXT: s_add_u32 s1, s2, 1 +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v26, v10 +; MOVREL-NEXT: v_mov_b32_e32 v25, v9 +; MOVREL-NEXT: v_mov_b32_e32 v24, v8 +; MOVREL-NEXT: v_mov_b32_e32 v23, v7 +; MOVREL-NEXT: v_mov_b32_e32 v22, v6 +; MOVREL-NEXT: v_mov_b32_e32 v21, v5 +; MOVREL-NEXT: v_mov_b32_e32 v20, v4 +; MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; MOVREL-NEXT: v_mov_b32_e32 v18, v2 +; MOVREL-NEXT: v_movreld_b32_e32 v17, s18 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_movreld_b32_e32 v17, s19 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB14_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[17:20], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[21:24], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[25:28], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[29:32], off +; MOVREL-NEXT: s_endpgm +entry: + %insert = insertelement <8 x double> %vec, double %val, i32 %idx + %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1> + %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5> + %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7> + store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef + ret void +} + +define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s0, 
s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s0 +; GPRIDX-NEXT: s_lshl_b32 s0, s18, 1 +; GPRIDX-NEXT: s_add_u32 s1, s0, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v2, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v2, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; GPRIDX-NEXT: s_endpgm +; +; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: s_lshl_b32 s16, s18, 1 +; MOVREL-NEXT: v_mov_b32_e32 v17, s15 +; MOVREL-NEXT: v_mov_b32_e32 v15, s13 +; MOVREL-NEXT: v_mov_b32_e32 v14, s12 +; MOVREL-NEXT: v_mov_b32_e32 v16, s14 +; MOVREL-NEXT: s_mov_b32 m0, s16 +; MOVREL-NEXT: v_mov_b32_e32 v13, s11 +; MOVREL-NEXT: v_mov_b32_e32 v12, s10 +; MOVREL-NEXT: v_mov_b32_e32 v11, s9 +; MOVREL-NEXT: v_mov_b32_e32 v10, s8 +; MOVREL-NEXT: v_mov_b32_e32 v9, s7 +; MOVREL-NEXT: v_mov_b32_e32 v8, s6 +; MOVREL-NEXT: v_mov_b32_e32 v7, s5 +; MOVREL-NEXT: v_mov_b32_e32 v6, s4 +; MOVREL-NEXT: v_mov_b32_e32 v5, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 +; MOVREL-NEXT: s_add_u32 s0, s16, 1 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_movreld_b32_e32 v2, v0 +; MOVREL-NEXT: s_mov_b32 m0, s0 +; MOVREL-NEXT: v_movreld_b32_e32 v2, v1 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[10:13], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[14:17], off +; MOVREL-NEXT: s_endpgm +entry: + %insert = insertelement <8 x double> %vec, double %val, i32 %idx + %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1> + %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5> + %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7> + 
store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef + ret void +} + +define amdgpu_ps void @dyn_insertelement_v8f64_v_s_s(<8 x double> %vec, double inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_lshl_b32 s0, s4, 1 +; GPRIDX-NEXT: s_add_u32 s1, s0, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, s2 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, s3 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; GPRIDX-NEXT: s_endpgm +; +; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_lshl_b32 s0, s4, 1 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: s_mov_b32 m0, s0 +; MOVREL-NEXT: s_add_u32 s0, s0, 1 +; MOVREL-NEXT: v_movreld_b32_e32 v0, s2 +; MOVREL-NEXT: s_mov_b32 m0, s0 +; MOVREL-NEXT: v_movreld_b32_e32 v0, s3 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off +; MOVREL-NEXT: s_endpgm +entry: + %insert = insertelement <8 x double> %vec, double %val, i32 %idx + %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1> + %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3> + %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5> + %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7> + store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef + store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef + ret void +} + +define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_v: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 +; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 +; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 +; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 +; GPRIDX-NEXT: s_add_u32 s3, s2, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v34, v18 +; GPRIDX-NEXT: v_mov_b32_e32 v33, v17 +; GPRIDX-NEXT: v_mov_b32_e32 v32, v16 +; GPRIDX-NEXT: v_mov_b32_e32 v31, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v30, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v29, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v28, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v27, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v26, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v25, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v24, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v23, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v22, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v21, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v19, v1 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB17_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; GPRIDX-NEXT: s_endpgm +; +; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_v: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: v_mov_b32_e32 v18, s15 +; MOVREL-NEXT: v_mov_b32_e32 v16, s13 +; MOVREL-NEXT: v_mov_b32_e32 v14, s11 +; MOVREL-NEXT: v_mov_b32_e32 v15, s12 +; MOVREL-NEXT: v_mov_b32_e32 v17, s14 +; MOVREL-NEXT: v_mov_b32_e32 v13, s10 +; MOVREL-NEXT: v_mov_b32_e32 v12, s9 +; MOVREL-NEXT: v_mov_b32_e32 v11, s8 +; MOVREL-NEXT: v_mov_b32_e32 v10, s7 +; MOVREL-NEXT: v_mov_b32_e32 v9, s6 +; MOVREL-NEXT: v_mov_b32_e32 v8, s5 +; MOVREL-NEXT: v_mov_b32_e32 v7, s4 +; MOVREL-NEXT: v_mov_b32_e32 v6, s3 +; MOVREL-NEXT: v_mov_b32_e32 v5, s2 +; MOVREL-NEXT: v_mov_b32_e32 v4, s1 +; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB17_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v2 +; MOVREL-NEXT: v_mov_b32_e32 v34, v18 +; MOVREL-NEXT: v_mov_b32_e32 v19, v3 +; MOVREL-NEXT: v_mov_b32_e32 v33, v17 +; MOVREL-NEXT: v_mov_b32_e32 v32, v16 +; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v2 +; MOVREL-NEXT: v_mov_b32_e32 v31, v15 +; MOVREL-NEXT: v_mov_b32_e32 v30, v14 +; MOVREL-NEXT: v_mov_b32_e32 v29, v13 +; MOVREL-NEXT: s_add_u32 s1, s2, 1 +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v28, 
v12
+; MOVREL-NEXT: v_mov_b32_e32 v27, v11
+; MOVREL-NEXT: v_mov_b32_e32 v26, v10
+; MOVREL-NEXT: v_mov_b32_e32 v25, v9
+; MOVREL-NEXT: v_mov_b32_e32 v24, v8
+; MOVREL-NEXT: v_mov_b32_e32 v23, v7
+; MOVREL-NEXT: v_mov_b32_e32 v22, v6
+; MOVREL-NEXT: v_mov_b32_e32 v21, v5
+; MOVREL-NEXT: v_mov_b32_e32 v20, v4
+; MOVREL-NEXT: v_movreld_b32_e32 v19, v0
+; MOVREL-NEXT: s_mov_b32 m0, s1
+; MOVREL-NEXT: v_movreld_b32_e32 v19, v1
+; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT: s_cbranch_execnz BB17_1
+; MOVREL-NEXT: ; %bb.2:
+; MOVREL-NEXT: s_mov_b32 exec_lo, s0
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off
+; MOVREL-NEXT: s_endpgm
+entry:
+  %insert = insertelement <8 x double> %vec, double %val, i32 %idx
+  %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_ps void @dyn_insertelement_v8f64_v_s_v(<8 x double> %vec, double inreg %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v8f64_v_s_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT: v_readfirstlane_b32 s4, v16
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16
+; GPRIDX-NEXT: s_lshl_b32 s4, s4, 1
+; GPRIDX-NEXT: s_add_u32 s5, s4, 1
+; GPRIDX-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v32, v15
+; GPRIDX-NEXT: v_mov_b32_e32 v31, v14
+; GPRIDX-NEXT: v_mov_b32_e32 v30, v13
+; GPRIDX-NEXT: v_mov_b32_e32 v29, v12
+; GPRIDX-NEXT: v_mov_b32_e32 v28, v11
+; GPRIDX-NEXT: v_mov_b32_e32 v27, v10
+; GPRIDX-NEXT: v_mov_b32_e32 v26, v9
+; GPRIDX-NEXT: v_mov_b32_e32 v25, v8
+; GPRIDX-NEXT: v_mov_b32_e32 v24, v7
+; GPRIDX-NEXT: v_mov_b32_e32 v23, v6
+; GPRIDX-NEXT: v_mov_b32_e32 v22, v5
+; GPRIDX-NEXT: v_mov_b32_e32 v21, v4
+; GPRIDX-NEXT: v_mov_b32_e32 v20, v3
+; GPRIDX-NEXT: v_mov_b32_e32 v19, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v18, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v17, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s2
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: s_set_gpr_idx_on s5, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v17, s3
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT: s_cbranch_execnz BB18_1
+; GPRIDX-NEXT: ; %bb.2:
+; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[17:20], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[21:24], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[25:28], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[29:32], off
+; GPRIDX-NEXT: s_endpgm
+;
+; MOVREL-LABEL: dyn_insertelement_v8f64_v_s_v:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: BB18_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v16
+; MOVREL-NEXT: v_mov_b32_e32 v32, v15
+; MOVREL-NEXT: v_mov_b32_e32 v17, v0
+; MOVREL-NEXT: v_mov_b32_e32 v31, v14
+; MOVREL-NEXT: v_mov_b32_e32 v30, v13
+; MOVREL-NEXT: s_lshl_b32 s4, s1, 1
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v16
+; MOVREL-NEXT: v_mov_b32_e32 v29, v12
+; MOVREL-NEXT: v_mov_b32_e32 v28, v11
+; MOVREL-NEXT: v_mov_b32_e32 v27, v10
+; MOVREL-NEXT: s_add_u32 s1, s4, 1
+; MOVREL-NEXT: s_mov_b32 m0, s4
+; MOVREL-NEXT: v_mov_b32_e32 v26, v9
+; MOVREL-NEXT: v_mov_b32_e32 v25, v8
+; MOVREL-NEXT: v_mov_b32_e32 v24, v7
+; MOVREL-NEXT: v_mov_b32_e32 v23, v6
+; MOVREL-NEXT: v_mov_b32_e32 v22, v5
+; MOVREL-NEXT: v_mov_b32_e32 v21, v4
+; MOVREL-NEXT: v_mov_b32_e32 v20, v3
+; MOVREL-NEXT: v_mov_b32_e32 v19, v2
+; MOVREL-NEXT: v_mov_b32_e32 v18, v1
+; MOVREL-NEXT: v_movreld_b32_e32 v17, s2
+; MOVREL-NEXT: s_mov_b32 m0, s1
+; MOVREL-NEXT: v_movreld_b32_e32 v17, s3
+; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT: s_cbranch_execnz BB18_1
+; MOVREL-NEXT: ; %bb.2:
+; MOVREL-NEXT: s_mov_b32 exec_lo, s0
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[17:20], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[21:24], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[25:28], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[29:32], off
+; MOVREL-NEXT: s_endpgm
+entry:
+  %insert = insertelement <8 x double> %vec, double %val, i32 %idx
+  %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_ps void @dyn_insertelement_v8f64_v_v_s(<8 x double> %vec, double %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_lshl_b32 s0, s2, 1
+; GPRIDX-NEXT: s_add_u32 s1, s0, 1
+; GPRIDX-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v16
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: s_set_gpr_idx_on s1, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v17
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
+; GPRIDX-NEXT: s_endpgm
+;
+; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_s:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_lshl_b32 s0, s2, 1
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: s_mov_b32 m0, s0
+; MOVREL-NEXT: s_add_u32 s0, s0, 1
+; MOVREL-NEXT: v_movreld_b32_e32 v0, v16
+; MOVREL-NEXT: s_mov_b32 m0, s0
+; MOVREL-NEXT: v_movreld_b32_e32 v0, v17
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
+; MOVREL-NEXT: s_endpgm
+entry:
+  %insert = insertelement <8 x double> %vec, double %val, i32 %idx
+  %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v(<8 x double> %vec, double %val, i32 %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b64 s[0:1], exec
+; GPRIDX-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1
+; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18
+; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18
+; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1
+; GPRIDX-NEXT: s_add_u32 s3, s2, 1
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v34, v15
+; GPRIDX-NEXT: v_mov_b32_e32 v33, v14
+; GPRIDX-NEXT: v_mov_b32_e32 v32, v13
+; GPRIDX-NEXT: v_mov_b32_e32 v31, v12
+; GPRIDX-NEXT: v_mov_b32_e32 v30, v11
+; GPRIDX-NEXT: v_mov_b32_e32 v29, v10
+; GPRIDX-NEXT: v_mov_b32_e32 v28, v9
+; GPRIDX-NEXT: v_mov_b32_e32 v27, v8
+; GPRIDX-NEXT: v_mov_b32_e32 v26, v7
+; GPRIDX-NEXT: v_mov_b32_e32 v25, v6
+; GPRIDX-NEXT: v_mov_b32_e32 v24, v5
+; GPRIDX-NEXT: v_mov_b32_e32 v23, v4
+; GPRIDX-NEXT: v_mov_b32_e32 v22, v3
+; GPRIDX-NEXT: v_mov_b32_e32 v21, v2
+; GPRIDX-NEXT: v_mov_b32_e32 v20, v1
+; GPRIDX-NEXT: v_mov_b32_e32 v19, v0
+; GPRIDX-NEXT: v_mov_b32_e32 v19, v16
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v19, v17
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc
+; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc
+; GPRIDX-NEXT: s_cbranch_execnz BB20_1
+; GPRIDX-NEXT: ; %bb.2:
+; GPRIDX-NEXT: s_mov_b64 exec, s[0:1]
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off
+; GPRIDX-NEXT: s_endpgm
+;
+; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_mov_b32 s0, exec_lo
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: BB20_1: ; =>This Inner Loop Header: Depth=1
+; MOVREL-NEXT: v_readfirstlane_b32 s1, v18
+; MOVREL-NEXT: v_mov_b32_e32 v34, v15
+; MOVREL-NEXT: v_mov_b32_e32 v19, v0
+; MOVREL-NEXT: v_mov_b32_e32 v33, v14
+; MOVREL-NEXT: v_mov_b32_e32 v32, v13
+; MOVREL-NEXT: s_lshl_b32 s2, s1, 1
+; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18
+; MOVREL-NEXT: v_mov_b32_e32 v31, v12
+; MOVREL-NEXT: v_mov_b32_e32 v30, v11
+; MOVREL-NEXT: v_mov_b32_e32 v29, v10
+; MOVREL-NEXT: s_add_u32 s1, s2, 1
+; MOVREL-NEXT: s_mov_b32 m0, s2
+; MOVREL-NEXT: v_mov_b32_e32 v28, v9
+; MOVREL-NEXT: v_mov_b32_e32 v27, v8
+; MOVREL-NEXT: v_mov_b32_e32 v26, v7
+; MOVREL-NEXT: v_mov_b32_e32 v25, v6
+; MOVREL-NEXT: v_mov_b32_e32 v24, v5
+; MOVREL-NEXT: v_mov_b32_e32 v23, v4
+; MOVREL-NEXT: v_mov_b32_e32 v22, v3
+; MOVREL-NEXT: v_mov_b32_e32 v21, v2
+; MOVREL-NEXT: v_mov_b32_e32 v20, v1
+; MOVREL-NEXT: v_movreld_b32_e32 v19, v16
+; MOVREL-NEXT: s_mov_b32 m0, s1
+; MOVREL-NEXT: v_movreld_b32_e32 v19, v17
+; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; MOVREL-NEXT: s_cbranch_execnz BB20_1
+; MOVREL-NEXT: ; %bb.2:
+; MOVREL-NEXT: s_mov_b32 exec_lo, s0
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off
+; MOVREL-NEXT: s_endpgm
+entry:
+  %insert = insertelement <8 x double> %vec, double %val, i32 %idx
+  %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef
+  ret void
+}
+
+define amdgpu_ps <3 x i32> @dyn_insertelement_v3i32_s_s_s(<3 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v3i32_s_s_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 m0, s6
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: s_movreld_b32 s0, s5
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v3i32_s_s_s:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_mov_b32 s0, s2
+; MOVREL-NEXT: s_mov_b32 m0, s6
+; MOVREL-NEXT: s_mov_b32 s1, s3
+; MOVREL-NEXT: s_mov_b32 s2, s4
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: s_movreld_b32 s0, s5
+; MOVREL-NEXT: ; return to shader part epilog
+entry:
+  %insert = insertelement <3 x i32> %vec, i32 %val, i32 %idx
+  ret <3 x i32> %insert
+}
+
+define amdgpu_ps <3 x float> @dyn_insertelement_v3i32_v_v_s(<3 x float> %vec, float %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v3i32_v_v_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GPRIDX-NEXT: v_mov_b32_e32 v0, v3
+; GPRIDX-NEXT: s_set_gpr_idx_off
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v3i32_v_v_s:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_mov_b32 m0, s2
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: v_movreld_b32_e32 v0, v3
+; MOVREL-NEXT: ; return to shader part epilog
+entry:
+  %insert = insertelement <3 x float> %vec, float %val, i32 %idx
+  ret <3 x float> %insert
+}
+
+define amdgpu_ps <5 x i32> @dyn_insertelement_v5i32_s_s_s(<5 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) {
+; GPRIDX-LABEL: dyn_insertelement_v5i32_s_s_s:
+; GPRIDX: ; %bb.0: ; %entry
+; GPRIDX-NEXT: s_mov_b32 s0, s2
+; GPRIDX-NEXT: s_mov_b32 s1, s3
+; GPRIDX-NEXT: s_mov_b32 s2, s4
+; GPRIDX-NEXT: s_mov_b32 s3, s5
+; GPRIDX-NEXT: s_mov_b32 s4, s6
+; GPRIDX-NEXT: s_mov_b32 m0, s8
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: s_movreld_b32 s0, s7
+; GPRIDX-NEXT: ; return to shader part epilog
+;
+; MOVREL-LABEL: dyn_insertelement_v5i32_s_s_s:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_mov_b32 s0, s2
+; MOVREL-NEXT: s_mov_b32 m0, s8
+; MOVREL-NEXT: s_mov_b32 s1, s3
+; MOVREL-NEXT: s_mov_b32 s2, s4
+; MOVREL-NEXT: s_mov_b32 s3, s5
+; MOVREL-NEXT: s_mov_b32 s4, s6
+; MOVREL-NEXT: s_movreld_b32 s0, s7
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: ; return to shader part epilog
+entry:
+  %insert = insertelement <5 x
i32> %vec, i32 %val, i32 %idx + ret <5 x i32> %insert +} + +define amdgpu_ps <5 x float> @dyn_insertelement_v5i32_v_v_s(<5 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v5i32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v5 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v5i32_v_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_movreld_b32_e32 v0, v5 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <5 x float> %vec, float %val, i32 %idx + ret <5 x float> %insert +} + +define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %vec, i32 inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v32i32_s_s_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 +; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 +; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 +; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 +; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 +; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 +; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 +; GPRIDX-NEXT: s_mov_b32 s29, s31 +; GPRIDX-NEXT: s_mov_b32 s30, s32 +; GPRIDX-NEXT: s_mov_b32 s31, s33 +; GPRIDX-NEXT: s_mov_b32 m0, s35 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b32 s0, s34 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v32i32_s_s_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 m0, s35 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_mov_b32 s16, s18 +; MOVREL-NEXT: s_mov_b32 s17, s19 +; MOVREL-NEXT: s_mov_b32 s18, s20 +; MOVREL-NEXT: s_mov_b32 s19, s21 +; MOVREL-NEXT: s_mov_b32 s20, s22 +; MOVREL-NEXT: s_mov_b32 s21, s23 +; MOVREL-NEXT: s_mov_b32 s22, s24 +; MOVREL-NEXT: s_mov_b32 s23, s25 +; MOVREL-NEXT: s_mov_b32 s24, s26 +; MOVREL-NEXT: s_mov_b32 s25, s27 +; MOVREL-NEXT: s_mov_b32 s26, s28 +; MOVREL-NEXT: s_mov_b32 s27, s29 +; MOVREL-NEXT: s_mov_b32 s28, s30 +; MOVREL-NEXT: s_mov_b32 s29, s31 +; MOVREL-NEXT: s_mov_b32 s30, s32 +; MOVREL-NEXT: s_mov_b32 s31, s33 +; MOVREL-NEXT: s_movreld_b32 s0, s34 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + 
%insert = insertelement <32 x i32> %vec, i32 %val, i32 %idx + ret <32 x i32> %insert +} + +define amdgpu_ps <32 x float> @dyn_insertelement_v32i32_v_v_s(<32 x float> %vec, float %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v32i32_v_v_s: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v0, v32 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v32i32_v_v_s: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: v_movreld_b32_e32 v0, v32 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %insert = insertelement <32 x float> %vec, float %val, i32 %idx + ret <32 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_1: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_add_u32 m0, s11, 1 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_1: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_add_u32 m0, s11, 1 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; MOVREL-NEXT: v_mov_b32_e32 v6, s6 +; MOVREL-NEXT: v_mov_b32_e32 v7, s7 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %idx.add = add i32 %idx, 1 + %insert = insertelement <8 x float> %vec, float %val, i32 %idx.add + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_7(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_7: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_add_u32 m0, s11, 7 +; GPRIDX-NEXT: s_nop 0 +; GPRIDX-NEXT: s_movreld_b32 s0, s10 +; GPRIDX-NEXT: v_mov_b32_e32 v0, s0 +; GPRIDX-NEXT: v_mov_b32_e32 v1, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v2, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s4 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s5 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s6 +; GPRIDX-NEXT: v_mov_b32_e32 v7, s7 +; GPRIDX-NEXT: ; return 
to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_s_add_7: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_add_u32 m0, s11, 7 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_movreld_b32 s0, s10 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 +; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s2 +; MOVREL-NEXT: v_mov_b32_e32 v3, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s4 +; MOVREL-NEXT: v_mov_b32_e32 v5, s5 +; MOVREL-NEXT: v_mov_b32_e32 v6, s6 +; MOVREL-NEXT: v_mov_b32_e32 v7, s7 +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: ; return to shader part epilog +entry: + %idx.add = add i32 %idx, 7 + %insert = insertelement <8 x float> %vec, float %val, i32 %idx.add + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_1(<8 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_add_u32_e32 v17, 1, v9 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB29_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_1: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: v_add_nc_u32_e32 v17, 1, v9 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB29_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v17 +; MOVREL-NEXT: v_mov_b32_e32 v16, v7 +; MOVREL-NEXT: v_mov_b32_e32 v9, v0 +; MOVREL-NEXT: v_mov_b32_e32 v15, v6 +; MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_mov_b32_e32 v13, v4 +; MOVREL-NEXT: v_mov_b32_e32 v12, v3 +; MOVREL-NEXT: v_mov_b32_e32 v11, v2 +; MOVREL-NEXT: v_mov_b32_e32 v10, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v9, v8 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB29_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, v9 +; MOVREL-NEXT: v_mov_b32_e32 v1, v10 +; MOVREL-NEXT: v_mov_b32_e32 v2, v11 +; MOVREL-NEXT: v_mov_b32_e32 v3, v12 +; MOVREL-NEXT: v_mov_b32_e32 v4, v13 +; MOVREL-NEXT: v_mov_b32_e32 v5, v14 +; MOVREL-NEXT: v_mov_b32_e32 
v6, v15 +; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %idx.add = add i32 %idx, 1 + %insert = insertelement <8 x float> %vec, float %val, i32 %idx.add + ret <8 x float> %insert +} + +define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_v_v_v_add_7(<8 x float> %vec, float %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_add_u32_e32 v17, 7, v9 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v17 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v17 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v16, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v15, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v14, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v13, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v12, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v11, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v10, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v9, v8 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB30_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: v_mov_b32_e32 v0, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v1, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v2, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v3, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v4, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v5, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v6, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v7, v16 +; GPRIDX-NEXT: ; return to shader part epilog +; +; MOVREL-LABEL: dyn_insertelement_v8f32_v_v_v_add_7: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: v_add_nc_u32_e32 v17, 7, v9 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB30_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v17 +; MOVREL-NEXT: v_mov_b32_e32 v16, v7 +; MOVREL-NEXT: v_mov_b32_e32 v9, v0 +; MOVREL-NEXT: v_mov_b32_e32 v15, v6 +; MOVREL-NEXT: v_mov_b32_e32 v14, v5 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v17 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_mov_b32_e32 v13, v4 +; MOVREL-NEXT: v_mov_b32_e32 v12, v3 +; MOVREL-NEXT: v_mov_b32_e32 v11, v2 +; MOVREL-NEXT: v_mov_b32_e32 v10, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v9, v8 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB30_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: v_mov_b32_e32 v0, v9 +; MOVREL-NEXT: v_mov_b32_e32 v1, v10 +; MOVREL-NEXT: v_mov_b32_e32 v2, v11 +; MOVREL-NEXT: v_mov_b32_e32 v3, v12 +; MOVREL-NEXT: v_mov_b32_e32 v4, v13 +; MOVREL-NEXT: v_mov_b32_e32 v5, v14 +; MOVREL-NEXT: v_mov_b32_e32 v6, v15 +; MOVREL-NEXT: v_mov_b32_e32 v7, v16 +; MOVREL-NEXT: ; return to shader part epilog +entry: + %idx.add = add i32 %idx, 7 + %insert = insertelement <8 x float> %vec, float %val, i32 %idx.add + ret <8 x float> %insert +} + +define amdgpu_ps void @dyn_insertelement_v8f64_s_s_s_add_1(<8 x double> inreg %vec, double inreg %val, i32 inreg %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_s_add_1: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 +; 
GPRIDX-NEXT: s_mov_b32 s9, s11
+; GPRIDX-NEXT: s_mov_b32 s10, s12
+; GPRIDX-NEXT: s_mov_b32 s11, s13
+; GPRIDX-NEXT: s_mov_b32 s12, s14
+; GPRIDX-NEXT: s_mov_b32 s13, s15
+; GPRIDX-NEXT: s_mov_b32 s14, s16
+; GPRIDX-NEXT: s_mov_b32 s15, s17
+; GPRIDX-NEXT: s_add_u32 m0, s20, 1
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: s_movreld_b64 s[0:1], s[18:19]
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s7
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s8
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s10
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s11
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GPRIDX-NEXT: s_nop 0
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s12
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s13
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s14
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s15
+; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GPRIDX-NEXT: s_endpgm
+;
+; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_s_add_1:
+; MOVREL: ; %bb.0: ; %entry
+; MOVREL-NEXT: s_mov_b32 s0, s2
+; MOVREL-NEXT: s_mov_b32 s1, s3
+; MOVREL-NEXT: s_add_u32 m0, s20, 1
+; MOVREL-NEXT: s_mov_b32 s2, s4
+; MOVREL-NEXT: s_mov_b32 s3, s5
+; MOVREL-NEXT: s_mov_b32 s4, s6
+; MOVREL-NEXT: s_mov_b32 s5, s7
+; MOVREL-NEXT: s_mov_b32 s6, s8
+; MOVREL-NEXT: s_mov_b32 s7, s9
+; MOVREL-NEXT: s_mov_b32 s8, s10
+; MOVREL-NEXT: s_mov_b32 s9, s11
+; MOVREL-NEXT: s_mov_b32 s10, s12
+; MOVREL-NEXT: s_mov_b32 s11, s13
+; MOVREL-NEXT: s_mov_b32 s12, s14
+; MOVREL-NEXT: s_mov_b32 s13, s15
+; MOVREL-NEXT: s_mov_b32 s14, s16
+; MOVREL-NEXT: s_mov_b32 s15, s17
+; MOVREL-NEXT: s_movreld_b64 s[0:1], s[18:19]
+; MOVREL-NEXT: v_mov_b32_e32 v0, s0
+; MOVREL-NEXT: v_mov_b32_e32 v4, s4
+; MOVREL-NEXT: v_mov_b32_e32 v8, s8
+; MOVREL-NEXT: v_mov_b32_e32 v12, s12
+; MOVREL-NEXT: v_mov_b32_e32 v1, s1
+; MOVREL-NEXT: v_mov_b32_e32 v2, s2
+; MOVREL-NEXT: v_mov_b32_e32 v3, s3
+; MOVREL-NEXT: v_mov_b32_e32 v5, s5
+; MOVREL-NEXT: v_mov_b32_e32 v6, s6
+; MOVREL-NEXT: v_mov_b32_e32 v7, s7
+; MOVREL-NEXT: v_mov_b32_e32 v9, s9
+; MOVREL-NEXT: v_mov_b32_e32 v10, s10
+; MOVREL-NEXT: v_mov_b32_e32 v11, s11
+; MOVREL-NEXT: v_mov_b32_e32 v13, s13
+; MOVREL-NEXT: v_mov_b32_e32 v14, s14
+; MOVREL-NEXT: v_mov_b32_e32 v15, s15
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[8:11], off
+; MOVREL-NEXT: ; implicit-def: $vcc_hi
+; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[12:15], off
+; MOVREL-NEXT: s_endpgm
+entry:
+  %idx.add = add i32 %idx, 1
+  %insert = insertelement <8 x double> %vec, double %val, i32 %idx.add
+  %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.3, <2
x double> addrspace(1)* undef + ret void +} + +define amdgpu_ps void @dyn_insertelement_v8f64_v_v_v_add_1(<8 x double> %vec, double %val, i32 %idx) { +; GPRIDX-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: +; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: v_add_u32_e32 v18, 1, v18 +; GPRIDX-NEXT: s_mov_b64 s[0:1], exec +; GPRIDX-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 +; GPRIDX-NEXT: v_readfirstlane_b32 s2, v18 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, s2, v18 +; GPRIDX-NEXT: s_lshl_b32 s2, s2, 1 +; GPRIDX-NEXT: s_add_u32 s3, s2, 1 +; GPRIDX-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v34, v15 +; GPRIDX-NEXT: v_mov_b32_e32 v33, v14 +; GPRIDX-NEXT: v_mov_b32_e32 v32, v13 +; GPRIDX-NEXT: v_mov_b32_e32 v31, v12 +; GPRIDX-NEXT: v_mov_b32_e32 v30, v11 +; GPRIDX-NEXT: v_mov_b32_e32 v29, v10 +; GPRIDX-NEXT: v_mov_b32_e32 v28, v9 +; GPRIDX-NEXT: v_mov_b32_e32 v27, v8 +; GPRIDX-NEXT: v_mov_b32_e32 v26, v7 +; GPRIDX-NEXT: v_mov_b32_e32 v25, v6 +; GPRIDX-NEXT: v_mov_b32_e32 v24, v5 +; GPRIDX-NEXT: v_mov_b32_e32 v23, v4 +; GPRIDX-NEXT: v_mov_b32_e32 v22, v3 +; GPRIDX-NEXT: v_mov_b32_e32 v21, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v20, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v19, v16 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_set_gpr_idx_on s3, gpr_idx(DST) +; GPRIDX-NEXT: v_mov_b32_e32 v19, v17 +; GPRIDX-NEXT: s_set_gpr_idx_off +; GPRIDX-NEXT: s_and_saveexec_b64 vcc, vcc +; GPRIDX-NEXT: s_xor_b64 exec, exec, vcc +; GPRIDX-NEXT: s_cbranch_execnz BB32_1 +; GPRIDX-NEXT: ; %bb.2: +; GPRIDX-NEXT: s_mov_b64 exec, s[0:1] +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; GPRIDX-NEXT: s_endpgm +; +; MOVREL-LABEL: dyn_insertelement_v8f64_v_v_v_add_1: +; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 +; MOVREL-NEXT: s_mov_b32 s0, exec_lo +; MOVREL-NEXT: ; implicit-def: $vcc_hi +; MOVREL-NEXT: BB32_1: ; =>This Inner Loop Header: Depth=1 +; MOVREL-NEXT: v_readfirstlane_b32 s1, v18 +; MOVREL-NEXT: v_mov_b32_e32 v34, v15 +; MOVREL-NEXT: v_mov_b32_e32 v19, v0 +; MOVREL-NEXT: v_mov_b32_e32 v33, v14 +; MOVREL-NEXT: v_mov_b32_e32 v32, v13 +; MOVREL-NEXT: s_lshl_b32 s2, s1, 1 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v18 +; MOVREL-NEXT: v_mov_b32_e32 v31, v12 +; MOVREL-NEXT: v_mov_b32_e32 v30, v11 +; MOVREL-NEXT: v_mov_b32_e32 v29, v10 +; MOVREL-NEXT: s_add_u32 s1, s2, 1 +; MOVREL-NEXT: s_mov_b32 m0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v28, v9 +; MOVREL-NEXT: v_mov_b32_e32 v27, v8 +; MOVREL-NEXT: v_mov_b32_e32 v26, v7 +; MOVREL-NEXT: v_mov_b32_e32 v25, v6 +; MOVREL-NEXT: v_mov_b32_e32 v24, v5 +; MOVREL-NEXT: v_mov_b32_e32 v23, v4 +; MOVREL-NEXT: v_mov_b32_e32 v22, v3 +; MOVREL-NEXT: v_mov_b32_e32 v21, v2 +; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_movreld_b32_e32 v19, v16 +; MOVREL-NEXT: s_mov_b32 m0, s1 +; MOVREL-NEXT: v_movreld_b32_e32 v19, v17 +; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo +; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo +; MOVREL-NEXT: s_cbranch_execnz BB32_1 +; MOVREL-NEXT: ; %bb.2: +; MOVREL-NEXT: s_mov_b32 exec_lo, s0 +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[19:22], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[23:26], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[27:30], off +; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[31:34], off +; MOVREL-NEXT: s_endpgm +entry: + %idx.add = add i32 %idx, 1 + 
%insert = insertelement <8 x double> %vec, double %val, i32 %idx.add
+  %vec.0 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %vec.1 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %vec.2 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %vec.3 = shufflevector <8 x double> %insert, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  store volatile <2 x double> %vec.0, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.1, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.2, <2 x double> addrspace(1)* undef
+  store volatile <2 x double> %vec.3, <2 x double> addrspace(1)* undef
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-insert-vector-elt.mir
@@ -0,0 +1,634 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=MOVREL %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GPRIDX %s
+
+---
+name: insert_vector_elt_s_s32_v2s32
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1, $sgpr2, $sgpr3
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v2s32
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V2_:%[0-9]+]]:sreg_64_xexec = S_INDIRECT_REG_WRITE_B32_V2 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V2_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v2s32
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY $sgpr0_sgpr1
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr2
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3
+    ; GPRIDX: $m0 = COPY [[COPY2]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V2_:%[0-9]+]]:sreg_64_xexec = S_INDIRECT_REG_WRITE_B32_V2 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V2_]]
+    %0:sgpr(<2 x s32>) = COPY $sgpr0_sgpr1
+    %1:sgpr(s32) = COPY $sgpr2
+    %2:sgpr(s32) = COPY $sgpr3
+    %3:sgpr(<2 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2
+    S_ENDPGM 0, implicit %3
+...
+ +--- +name: insert_vector_elt_s_s32_v3s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2, $sgpr3, $sgpr4 + + ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v3s32 + ; MOVREL: [[COPY:%[0-9]+]]:sreg_96 = COPY $sgpr0_sgpr1_sgpr2 + ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V3_:%[0-9]+]]:sreg_96 = S_INDIRECT_REG_WRITE_B32_V3 [[COPY]], [[COPY1]], 1, implicit $m0 + ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V3_]] + ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v3s32 + ; GPRIDX: [[COPY:%[0-9]+]]:sreg_96 = COPY $sgpr0_sgpr1_sgpr2 + ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V3_:%[0-9]+]]:sreg_96 = S_INDIRECT_REG_WRITE_B32_V3 [[COPY]], [[COPY1]], 1, implicit $m0 + ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V3_]] + %0:sgpr(<3 x s32>) = COPY $sgpr0_sgpr1_sgpr2 + %1:sgpr(s32) = COPY $sgpr3 + %2:sgpr(s32) = COPY $sgpr4 + %3:sgpr(<3 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... + +--- +name: insert_vector_elt_s_s32_v4s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $sgpr5 + + ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v4s32 + ; MOVREL: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V4_:%[0-9]+]]:sgpr_128 = S_INDIRECT_REG_WRITE_B32_V4 [[COPY]], [[COPY1]], 1, implicit $m0 + ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V4_]] + ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v4s32 + ; GPRIDX: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V4_:%[0-9]+]]:sgpr_128 = S_INDIRECT_REG_WRITE_B32_V4 [[COPY]], [[COPY1]], 1, implicit $m0 + ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V4_]] + %0:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s32) = COPY $sgpr3 + %2:sgpr(s32) = COPY $sgpr4 + %3:sgpr(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... 
+ +--- +name: insert_vector_elt_s_s32_v5s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, $sgpr5, $sgpr6 + + ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v5s32 + ; MOVREL: [[COPY:%[0-9]+]]:sreg_160 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V5_:%[0-9]+]]:sreg_160 = S_INDIRECT_REG_WRITE_B32_V5 [[COPY]], [[COPY1]], 1, implicit $m0 + ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V5_]] + ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v5s32 + ; GPRIDX: [[COPY:%[0-9]+]]:sreg_160 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V5_:%[0-9]+]]:sreg_160 = S_INDIRECT_REG_WRITE_B32_V5 [[COPY]], [[COPY1]], 1, implicit $m0 + ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V5_]] + %0:sgpr(<5 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + %1:sgpr(s32) = COPY $sgpr5 + %2:sgpr(s32) = COPY $sgpr6 + %3:sgpr(<5 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... + +--- +name: insert_vector_elt_s_s32_v8s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, $sgpr9 + + ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v8s32 + ; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0 + ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]] + ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v8s32 + ; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0 + ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]] + %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + %1:sgpr(s32) = COPY $sgpr8 + %2:sgpr(s32) = COPY $sgpr9 + %3:sgpr(<8 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... 
+
+---
+name: insert_vector_elt_s_s32_v16s32
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16, $sgpr17
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v16s32
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr16
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr17
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V16_:%[0-9]+]]:sreg_512 = S_INDIRECT_REG_WRITE_B32_V16 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V16_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v16s32
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr16
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr17
+    ; GPRIDX: $m0 = COPY [[COPY2]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V16_:%[0-9]+]]:sreg_512 = S_INDIRECT_REG_WRITE_B32_V16 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V16_]]
+    %0:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sgpr(s32) = COPY $sgpr16
+    %2:sgpr(s32) = COPY $sgpr17
+    %3:sgpr(<16 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: insert_vector_elt_s_s32_v32s32
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr40, $sgpr41
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v32s32
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_1024 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr40
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr41
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V32_:%[0-9]+]]:sreg_1024 = S_INDIRECT_REG_WRITE_B32_V32 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V32_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v32s32
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_1024 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr40
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr41
+    ; GPRIDX: $m0 = COPY [[COPY2]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V32_:%[0-9]+]]:sreg_1024 = S_INDIRECT_REG_WRITE_B32_V32 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V32_]]
+    %0:sgpr(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    %1:sgpr(s32) = COPY $sgpr40
+    %2:sgpr(s32) = COPY
$sgpr41 + %3:sgpr(<32 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... + +--- +name: insert_vector_elt_s_s64_v2s64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4_sgpr5, $sgpr6 + + ; MOVREL-LABEL: name: insert_vector_elt_s_s64_v2s64 + ; MOVREL: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; MOVREL: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B64_V2_:%[0-9]+]]:sgpr_128 = S_INDIRECT_REG_WRITE_B64_V2 [[COPY]], [[COPY1]], 35, implicit $m0 + ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V2_]] + ; GPRIDX-LABEL: name: insert_vector_elt_s_s64_v2s64 + ; GPRIDX: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr4_sgpr5 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B64_V2_:%[0-9]+]]:sgpr_128 = S_INDIRECT_REG_WRITE_B64_V2 [[COPY]], [[COPY1]], 35, implicit $m0 + ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V2_]] + %0:sgpr(<2 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:sgpr(s64) = COPY $sgpr4_sgpr5 + %2:sgpr(s32) = COPY $sgpr6 + %3:sgpr(<2 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... + +--- +name: insert_vector_elt_s_s64_v4s64 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10 + + ; MOVREL-LABEL: name: insert_vector_elt_s_s64_v4s64 + ; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; MOVREL: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr10 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[S_INDIRECT_REG_WRITE_B64_V4_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B64_V4 [[COPY]], [[COPY1]], 35, implicit $m0 + ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V4_]] + ; GPRIDX-LABEL: name: insert_vector_elt_s_s64_v4s64 + ; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr8_sgpr9 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr10 + ; GPRIDX: $m0 = COPY [[COPY2]] + ; GPRIDX: [[S_INDIRECT_REG_WRITE_B64_V4_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B64_V4 [[COPY]], [[COPY1]], 35, implicit $m0 + ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V4_]] + %0:sgpr(<4 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + %1:sgpr(s64) = COPY $sgpr8_sgpr9 + %2:sgpr(s32) = COPY $sgpr10 + %3:sgpr(<4 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... 
+
+---
+name: insert_vector_elt_s_s64_v8s64
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s64_v8s64
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr16_sgpr17
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B64_V8_:%[0-9]+]]:sreg_512 = S_INDIRECT_REG_WRITE_B64_V8 [[COPY]], [[COPY1]], 35, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V8_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s64_v8s64
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_512 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr16_sgpr17
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr18
+    ; GPRIDX: $m0 = COPY [[COPY2]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B64_V8_:%[0-9]+]]:sreg_512 = S_INDIRECT_REG_WRITE_B64_V8 [[COPY]], [[COPY1]], 35, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V8_]]
+    %0:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    %1:sgpr(s64) = COPY $sgpr16_sgpr17
+    %2:sgpr(s32) = COPY $sgpr18
+    %3:sgpr(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: insert_vector_elt_s_s64_v16s64
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr42
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s64_v16s64
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_1024 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr40_sgpr41
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr42
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B64_V16_:%[0-9]+]]:sreg_1024 = S_INDIRECT_REG_WRITE_B64_V16 [[COPY]], [[COPY1]], 35, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V16_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s64_v16s64
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_1024 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY $sgpr40_sgpr41
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr42
+    ; GPRIDX: $m0 = COPY [[COPY2]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B64_V16_:%[0-9]+]]:sreg_1024 = S_INDIRECT_REG_WRITE_B64_V16 [[COPY]], [[COPY1]], 35, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B64_V16_]]
+    %0:sgpr(<16 x s64>) = COPY
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + %1:sgpr(s64) = COPY $sgpr40_sgpr41 + %2:sgpr(s32) = COPY $sgpr42 + %3:sgpr(<16 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... + +--- +name: insert_vector_elt_vvs_s32_v2s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $sgpr3 + + ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v2s32 + ; MOVREL: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V2_:%[0-9]+]]:vreg_64 = V_INDIRECT_REG_WRITE_B32_V2 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V2_]] + ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v2s32 + ; GPRIDX: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0 + ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V2_:%[0-9]+]]:vreg_64 = V_INDIRECT_REG_WRITE_B32_V2 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; GPRIDX: S_SET_GPR_IDX_OFF + ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V2_]] + %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + %1:vgpr(s32) = COPY $vgpr2 + %2:sgpr(s32) = COPY $sgpr3 + %3:vgpr(<2 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... + +--- +name: insert_vector_elt_vvs_s32_v3s32 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2, $vgpr3, $sgpr4 + + ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v3s32 + ; MOVREL: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 + ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; MOVREL: $m0 = COPY [[COPY2]] + ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V3_:%[0-9]+]]:vreg_96 = V_INDIRECT_REG_WRITE_B32_V3 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V3_]] + ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v3s32 + ; GPRIDX: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2 + ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0 + ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V3_:%[0-9]+]]:vreg_96 = V_INDIRECT_REG_WRITE_B32_V3 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec + ; GPRIDX: S_SET_GPR_IDX_OFF + ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V3_]] + %0:vgpr(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + %1:vgpr(s32) = COPY $vgpr3 + %2:sgpr(s32) = COPY $sgpr4 + %3:vgpr(<3 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 + S_ENDPGM 0, implicit %3 +... 
+
+---
+name: insert_vector_elt_vvs_s32_v4s32
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr5
+
+    ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v4s32
+    ; MOVREL: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr5
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V4_:%[0-9]+]]:vreg_128 = V_INDIRECT_REG_WRITE_B32_V4 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V4_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v4s32
+    ; GPRIDX: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr5
+    ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0
+    ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V4_:%[0-9]+]]:vreg_128 = V_INDIRECT_REG_WRITE_B32_V4 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; GPRIDX: S_SET_GPR_IDX_OFF
+    ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V4_]]
+    %0:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+    %1:vgpr(s32) = COPY $vgpr4
+    %2:sgpr(s32) = COPY $sgpr5
+    %3:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: insert_vector_elt_vvs_s32_v5s32
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4, $vgpr5, $sgpr6
+
+    ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v5s32
+    ; MOVREL: [[COPY:%[0-9]+]]:vreg_160 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V5_:%[0-9]+]]:vreg_160 = V_INDIRECT_REG_WRITE_B32_V5 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V5_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v5s32
+    ; GPRIDX: [[COPY:%[0-9]+]]:vreg_160 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr6
+    ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0
+    ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V5_:%[0-9]+]]:vreg_160 = V_INDIRECT_REG_WRITE_B32_V5 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; GPRIDX: S_SET_GPR_IDX_OFF
+    ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V5_]]
+    %0:vgpr(<5 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4
+    %1:vgpr(s32) = COPY $vgpr5
+    %2:sgpr(s32) = COPY $sgpr6
+    %3:vgpr(<5 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2
+    S_ENDPGM 0, implicit %3
+...
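+
+# The <8 x s32> tests also cover indices with a constant offset added. The
+# G_ADD is selected to S_MOV_B32 + S_ADD_U32 feeding m0 (or S_SET_GPR_IDX_ON);
+# the constant is not folded into the write's subregister offset.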
+
+---
+name: insert_vector_elt_vvs_s32_v8s32
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $sgpr9
+
+    ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v8s32
+    ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; MOVREL: $m0 = COPY [[COPY2]]
+    ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v8s32
+    ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; GPRIDX: S_SET_GPR_IDX_ON [[COPY2]], 8, implicit-def $m0, implicit $m0
+    ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; GPRIDX: S_SET_GPR_IDX_OFF
+    ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
+    %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    %1:vgpr(s32) = COPY $vgpr8
+    %2:sgpr(s32) = COPY $sgpr9
+    %3:vgpr(<8 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2
+    S_ENDPGM 0, implicit %3
+...
+
+---
+name: insert_vector_elt_vvs_s32_v8s32_add_1
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $sgpr9
+
+    ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_1
+    ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; MOVREL: $m0 = COPY [[S_ADD_U32_]]
+    ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_1
+    ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 8, implicit-def $m0, implicit $m0
+    ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; GPRIDX: S_SET_GPR_IDX_OFF
+    ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
+    %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    %1:vgpr(s32) = COPY $vgpr8
+    %2:sgpr(s32) = COPY $sgpr9
+    %3:sgpr(s32) = G_CONSTANT i32 1
+    %4:sgpr(s32) = G_ADD %2, %3
+    %5:vgpr(<8 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %4
+    S_ENDPGM 0, implicit %5
+...
+
+---
+name: insert_vector_elt_vvs_s32_v8s32_add_8
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8, $sgpr9
+
+    ; MOVREL-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_8
+    ; MOVREL: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; MOVREL: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+    ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; MOVREL: $m0 = COPY [[S_ADD_U32_]]
+    ; MOVREL: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; MOVREL: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_vvs_s32_v8s32_add_8
+    ; GPRIDX: [[COPY:%[0-9]+]]:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    ; GPRIDX: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+    ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; GPRIDX: S_SET_GPR_IDX_ON [[S_ADD_U32_]], 8, implicit-def $m0, implicit $m0
+    ; GPRIDX: [[V_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:vreg_256 = V_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0, implicit $exec
+    ; GPRIDX: S_SET_GPR_IDX_OFF
+    ; GPRIDX: S_ENDPGM 0, implicit [[V_INDIRECT_REG_WRITE_B32_V8_]]
+    %0:vgpr(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
+    %1:vgpr(s32) = COPY $vgpr8
+    %2:sgpr(s32) = COPY $sgpr9
+    %3:sgpr(s32) = G_CONSTANT i32 8
+    %4:sgpr(s32) = G_ADD %2, %3
+    %5:vgpr(<8 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %4
+    S_ENDPGM 0, implicit %5
+...
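+
+# SGPR-bank versions of the add-offset tests. Here both MOVREL and GPRIDX
+# targets copy the index to m0; the GPR-index mode path is only used for
+# VGPR-bank vectors.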
+
+---
+name: insert_vector_elt_s_s32_v8s32_add_1
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, $sgpr9
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v8s32_add_1
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; MOVREL: $m0 = COPY [[S_ADD_U32_]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v8s32_add_1
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
+    ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]]
+    %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    %1:sgpr(s32) = COPY $sgpr8
+    %2:sgpr(s32) = COPY $sgpr9
+    %3:sgpr(s32) = G_CONSTANT i32 1
+    %4:sgpr(s32) = G_ADD %2, %3
+    %5:sgpr(<8 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %4
+    S_ENDPGM 0, implicit %5
+...
+
+---
+name: insert_vector_elt_s_s32_v8s32_add_8
+legalized: true
+regBankSelected: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $sgpr8, $sgpr9
+
+    ; MOVREL-LABEL: name: insert_vector_elt_s_s32_v8s32_add_8
+    ; MOVREL: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; MOVREL: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
+    ; MOVREL: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; MOVREL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+    ; MOVREL: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; MOVREL: $m0 = COPY [[S_ADD_U32_]]
+    ; MOVREL: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; MOVREL: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]]
+    ; GPRIDX-LABEL: name: insert_vector_elt_s_s32_v8s32_add_8
+    ; GPRIDX: [[COPY:%[0-9]+]]:sreg_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    ; GPRIDX: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr8
+    ; GPRIDX: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
+    ; GPRIDX: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 8
+    ; GPRIDX: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY2]], [[S_MOV_B32_]], implicit-def $scc
+    ; GPRIDX: $m0 = COPY [[S_ADD_U32_]]
+    ; GPRIDX: [[S_INDIRECT_REG_WRITE_B32_V8_:%[0-9]+]]:sreg_256 = S_INDIRECT_REG_WRITE_B32_V8 [[COPY]], [[COPY1]], 1, implicit $m0
+    ; GPRIDX: S_ENDPGM 0, implicit [[S_INDIRECT_REG_WRITE_B32_V8_]]
+    %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
+    %1:sgpr(s32) = COPY $sgpr8
+    %2:sgpr(s32) = COPY $sgpr9
+    %3:sgpr(s32) = G_CONSTANT i32 8
+    %4:sgpr(s32) = G_ADD %2, %3
+    %5:sgpr(<8 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %4
+    S_ENDPGM 0, implicit %5
+...