Index: llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -318,6 +318,10 @@ LegalizeResult bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx, LLT CastTy); + /// Perform Bitcast legalize action on G_INSERT_VECTOR_ELT. + LegalizeResult bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy); + LegalizeResult lowerBitcast(MachineInstr &MI); LegalizeResult lowerLoad(MachineInstr &MI); LegalizeResult lowerStore(MachineInstr &MI); Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2361,6 +2361,28 @@ return UnableToLegalize; } +/// Figure out the bit offset into a register when coercing a vector index for +/// the wide element type. This is only for the case when promoting vector to +/// one with larger elements. +// +/// +/// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize)) +/// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize) +static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B, + Register Idx, + unsigned NewEltSize, + unsigned OldEltSize) { + const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); + LLT IdxTy = B.getMRI()->getType(Idx); + + // Now figure out the amount we need to shift to get the target bits. + auto OffsetMask = B.buildConstant( + IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio)); + auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask); + return B.buildShl(IdxTy, OffsetIdx, + B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0); +} + /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. 
If this /// is casting to a vector with a smaller element size, perform multiple element /// extracts and merge the results. If this is coercing to a vector with larger @@ -2459,13 +2481,9 @@ ScaledIdx).getReg(0); } - // Now figure out the amount we need to shift to get the target bits. - auto OffsetMask = MIRBuilder.buildConstant( - IdxTy, ~(APInt::getAllOnesValue(IdxTy.getSizeInBits()) << Log2EltRatio)); - auto OffsetIdx = MIRBuilder.buildAnd(IdxTy, Idx, OffsetMask); - auto OffsetBits = MIRBuilder.buildShl( - IdxTy, OffsetIdx, - MIRBuilder.buildConstant(IdxTy, Log2_32(OldEltSize))); + // Compute the bit offset into the register of the target element. + Register OffsetBits = getBitcastWiderVectorElementOffset( + MIRBuilder, Idx, NewEltSize, OldEltSize); // Shift the wide element to get the target element. auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits); @@ -2477,6 +2495,104 @@ return UnableToLegalize; } +/// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p +/// TargetReg, while preserving other bits in \p TargetReg. 
+/// +/// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset) +static Register buildBitFieldInsert(MachineIRBuilder &B, + Register TargetReg, Register InsertReg, + Register OffsetBits) { + LLT TargetTy = B.getMRI()->getType(TargetReg); + LLT InsertTy = B.getMRI()->getType(InsertReg); + auto ZextVal = B.buildZExt(TargetTy, InsertReg); + auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits); + + // Produce a bitmask of the value to insert + auto EltMask = B.buildConstant( + TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(), + InsertTy.getSizeInBits())); + // Shift it into position + auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits); + auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask); + + // Clear out the bits in the wide element + auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask); + + // The value to insert has all zeros already, so stick it into the masked + // wide element. + return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0); +} + +/// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this +/// is increasing the element size, perform the indexing in the target element +/// type, and use bit operations to insert at the element position. This is +/// intended for architectures that can dynamically index the register file and +/// want to force indexing in the native register size. +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx, + LLT CastTy) { + if (TypeIdx != 0) + return UnableToLegalize; + + Register Dst = MI.getOperand(0).getReg(); + Register SrcVec = MI.getOperand(1).getReg(); + Register Val = MI.getOperand(2).getReg(); + Register Idx = MI.getOperand(3).getReg(); + + LLT VecTy = MRI.getType(Dst); + LLT ValTy = MRI.getType(Val); + LLT IdxTy = MRI.getType(Idx); + + LLT VecEltTy = VecTy.getElementType(); + LLT NewEltTy = CastTy.isVector() ? 
CastTy.getElementType() : CastTy; + const unsigned NewEltSize = NewEltTy.getSizeInBits(); + const unsigned OldEltSize = VecEltTy.getSizeInBits(); + + unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1; + unsigned OldNumElts = VecTy.getNumElements(); + + Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0); + if (NewNumElts < OldNumElts) { + if (NewEltSize % OldEltSize != 0) + return UnableToLegalize; + + // This only depends on powers of 2 because we use bit tricks to figure out + // the bit offset we need to shift to get the target element. A general + // expansion could emit division/multiply. + if (!isPowerOf2_32(NewEltSize / OldEltSize)) + return UnableToLegalize; + + const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize); + auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio); + + // Divide to get the index in the wider element type. + auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio); + + Register ExtractedElt = CastVec; + if (CastTy.isVector()) { + ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, + ScaledIdx).getReg(0); + } + + // Compute the bit offset into the register of the target element. 
+ Register OffsetBits = getBitcastWiderVectorElementOffset( + MIRBuilder, Idx, NewEltSize, OldEltSize); + + Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt, + Val, OffsetBits); + if (CastTy.isVector()) { + InsertedElt = MIRBuilder.buildInsertVectorElement( + CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0); + } + + MIRBuilder.buildBitcast(Dst, InsertedElt); + MI.eraseFromParent(); + return Legalized; + } + + return UnableToLegalize; +} + LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(MachineInstr &MI) { // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT @@ -2666,6 +2782,8 @@ } case TargetOpcode::G_EXTRACT_VECTOR_ELT: return bitcastExtractVectorElt(MI, TypeIdx, CastTy); + case TargetOpcode::G_INSERT_VECTOR_ELT: + return bitcastInsertVectorElt(MI, TypeIdx, CastTy); default: return UnableToLegalize; } Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1319,11 +1319,11 @@ VecTy.getSizeInBits() <= MaxRegisterSize && IdxTy.getSizeInBits() == 32; }) - .bitcastIf(all(sizeIsMultipleOf32(1), scalarOrEltNarrowerThan(1, 32)), - bitcastToVectorElement32(1)) + .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), + bitcastToVectorElement32(VecTypeIdx)) //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) .bitcastIf( - all(sizeIsMultipleOf32(1), scalarOrEltWiderThan(1, 64)), + all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), [=](const LegalityQuery &Query) { // For > 64-bit element types, try to turn this into a 64-bit // element vector since we may be able to do better indexing Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -0,0 
+1,3306 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v2i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_and_b32 s1, s5, 1 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_lshl_b32 s3, s3, s1 +; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_andn2_b32 s0, s0, s1 +; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_and_b32 s1, s5, 1 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_and_b32 s3, s4, s2 +; GFX8-NEXT: s_lshl_b32 s3, s3, s1 +; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_and_b32 s1, s5, 1 +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, 
s1, 4 +; GFX7-NEXT: s_and_b32 s3, s4, s2 +; GFX7-NEXT: s_lshl_b32 s3, s3, s1 +; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_andn2_b32 s0, s0, s1 +; GFX7-NEXT: s_or_b32 s0, s0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v2i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; 
GFX7-NEXT: s_and_b32 s1, s3, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(1 )* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v2i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: s_and_b32 s1, s4, 1 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_andn2_b32 s0, s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_and_b32 s1, s4, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v2, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: 
s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_and_b32 s1, s4, 1 +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_andn2_b32 s0, s0, s1 +; GFX7-NEXT: v_or_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v2i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: s_and_b32 s2, s4, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, s0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_mov_b32 s1, 0xffff +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_and_b32 s2, s4, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 
v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s1, 0xffff +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s2, s4, s1 +; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v2i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s1 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, s0, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: s_mov_b32 s1, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: 
v_lshlrev_b32_e64 v1, v1, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_mov_b32 s1, 0xffff +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v2i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: s_mov_b32 
s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v2i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; 
GFX8-LABEL: insertelement_v_v2i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v2i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr + %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx + store <2 x i16> %insert, <2 x i16> addrspace(1)* null + ret void +} + +; FIXME: 3 element load/store legalization +; define amdgpu_ps void @insertelement_s_v3i16_s_s(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, 
<3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i16_s_s(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(1 )* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_s_v3i16_v_s(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_s_v3i16_s_v(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_s_v3i16_v_v(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i16_s_v(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i16_v_s(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { +; %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i16_v_v(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { +; %vec = 
load <3 x i16>, <3 x i16> addrspace(1)* %ptr +; %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx +; store <3 x i16> %insert, <3 x i16> addrspace(1)* null +; ret void +; } + +define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v4i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_lshr_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s3, s3, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s3, s3, 4 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_lshr_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s3, s3, 1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; 
GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_lshr_b32 s1, s3, 1 +; GFX7-NEXT: s_and_b32 s3, s3, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_lshl_b32 s0, s0, s3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v4i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cselect_b32 s3, s1, s0 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_lshl_b32 s4, s4, 4 +; GFX9-NEXT: s_lshl_b32 s5, s5, s4 +; GFX9-NEXT: s_andn2_b32 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v4i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_lshr_b32 s2, s4, 1 +; GFX8-NEXT: s_cmp_eq_u32 s2, 1 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cselect_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_andn2_b32 s3, s3, s4 +; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v4i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_lshr_b32 s2, s4, 1 +; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_mov_b32 s5, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_cselect_b32 s3, s1, s0 +; GFX7-NEXT: s_and_b32 s4, s4, 1 +; GFX7-NEXT: s_lshl_b32 s4, s4, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_andn2_b32 s3, s3, s4 +; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 
+; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v4i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v4i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_and_b32 s3, s4, s2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e64 v3, 
v0, s3 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v4i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s3, s4, s2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshl_b32_e32 v3, s3, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v4i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v4i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; 
GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v4i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v4i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, 
v5, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX8-NEXT: v_lshlrev_b32_e64 v4, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, s1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v4i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_lshr_b32 s1, s2, 1 +; GFX9-NEXT: s_and_b32 s2, s2, 1 +; GFX9-NEXT: s_lshl_b32 s2, s2, 4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v3, s0, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_lshr_b32 s1, s2, 1 +; GFX8-NEXT: s_and_b32 s2, s2, 1 +; GFX8-NEXT: s_lshl_b32 s2, s2, 4 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 
0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: s_lshr_b32 s1, s2, 1 +; GFX7-NEXT: s_and_b32 s2, s2, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, 4 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s0, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v4i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: s_waitcnt 
vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v5, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: 
v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm + %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr + %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx + store <4 x i16> %insert, <4 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v8i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s6, s5, 1 +; GFX9-NEXT: s_cmp_eq_u32 s6, 1 +; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cselect_b32 s7, s1, s0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 2 +; GFX9-NEXT: s_cselect_b32 s7, s2, s7 +; GFX9-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-NEXT: s_cselect_b32 s7, s3, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 1 +; GFX9-NEXT: s_lshl_b32 s5, s5, 4 +; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_lshl_b32 s4, s4, s5 +; GFX9-NEXT: s_lshl_b32 s5, s8, s5 +; GFX9-NEXT: s_andn2_b32 s5, s7, s5 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s0, s4, s0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 1 +; GFX9-NEXT: s_cselect_b32 s1, s4, s1 +; GFX9-NEXT: s_cmp_eq_u32 s6, 2 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_lshr_b32 s6, s5, 1 +; GFX8-NEXT: s_cmp_eq_u32 s6, 1 +; GFX8-NEXT: s_mov_b32 s8, 0xffff +; 
GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cselect_b32 s7, s1, s0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 2 +; GFX8-NEXT: s_cselect_b32 s7, s2, s7 +; GFX8-NEXT: s_cmp_eq_u32 s6, 3 +; GFX8-NEXT: s_cselect_b32 s7, s3, s7 +; GFX8-NEXT: s_and_b32 s5, s5, 1 +; GFX8-NEXT: s_lshl_b32 s5, s5, 4 +; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_lshl_b32 s4, s4, s5 +; GFX8-NEXT: s_lshl_b32 s5, s8, s5 +; GFX8-NEXT: s_andn2_b32 s5, s7, s5 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_cselect_b32 s0, s4, s0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 1 +; GFX8-NEXT: s_cselect_b32 s1, s4, s1 +; GFX8-NEXT: s_cmp_eq_u32 s6, 2 +; GFX8-NEXT: s_cselect_b32 s2, s4, s2 +; GFX8-NEXT: s_cmp_eq_u32 s6, 3 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_lshr_b32 s6, s5, 1 +; GFX7-NEXT: s_cmp_eq_u32 s6, 1 +; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_cselect_b32 s7, s1, s0 +; GFX7-NEXT: s_cmp_eq_u32 s6, 2 +; GFX7-NEXT: s_cselect_b32 s7, s2, s7 +; GFX7-NEXT: s_cmp_eq_u32 s6, 3 +; GFX7-NEXT: s_cselect_b32 s7, s3, s7 +; GFX7-NEXT: s_and_b32 s5, s5, 1 +; GFX7-NEXT: s_lshl_b32 s5, s5, 4 +; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_lshl_b32 s4, s4, s5 +; GFX7-NEXT: s_lshl_b32 s5, s8, s5 +; GFX7-NEXT: s_andn2_b32 s5, s7, s5 +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: s_cmp_eq_u32 s6, 0 +; GFX7-NEXT: s_cselect_b32 s0, s4, s0 +; GFX7-NEXT: s_cmp_eq_u32 s6, 1 +; GFX7-NEXT: s_cselect_b32 s1, s4, s1 +; GFX7-NEXT: s_cmp_eq_u32 s6, 2 +; GFX7-NEXT: s_cselect_b32 s2, s4, s2 +; GFX7-NEXT: s_cmp_eq_u32 s6, 3 +; GFX7-NEXT: s_cselect_b32 s3, s4, s3 +; 
GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v8i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s4, s3, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_and_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_not_b32 s5, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] +; GFX9-NEXT: v_and_or_b32 v4, v5, s5, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshr_b32 
s4, s3, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX8-NEXT: s_lshl_b32 s5, s2, s1 +; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_and_b32 s1, s3, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshr_b32 s4, s3, 1 +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: s_lshl_b32 s5, s2, s1 +; GFX7-NEXT: s_not_b32 s6, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] 
+; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(1 )* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v8i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s5, s4, 1 +; GFX9-NEXT: s_cmp_eq_u32 s5, 1 +; GFX9-NEXT: s_mov_b32 s7, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cselect_b32 s6, s1, s0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 2 +; GFX9-NEXT: s_cselect_b32 s6, s2, s6 +; GFX9-NEXT: s_cmp_eq_u32 s5, 3 +; GFX9-NEXT: s_cselect_b32 s6, s3, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 1 +; GFX9-NEXT: s_lshl_b32 s4, s4, 4 +; GFX9-NEXT: s_lshl_b32 s7, s7, s4 +; GFX9-NEXT: s_andn2_b32 s6, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: 
s_lshr_b32 s5, s4, 1 +; GFX8-NEXT: s_cmp_eq_u32 s5, 1 +; GFX8-NEXT: s_mov_b32 s7, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cselect_b32 s6, s1, s0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 2 +; GFX8-NEXT: s_cselect_b32 s6, s2, s6 +; GFX8-NEXT: s_cmp_eq_u32 s5, 3 +; GFX8-NEXT: s_cselect_b32 s6, s3, s6 +; GFX8-NEXT: s_and_b32 s4, s4, 1 +; GFX8-NEXT: s_lshl_b32 s4, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_lshl_b32 s4, s7, s4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_andn2_b32 s4, s6, s4 +; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_lshr_b32 s5, s4, 1 +; GFX7-NEXT: s_cmp_eq_u32 s5, 1 +; GFX7-NEXT: s_mov_b32 s7, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_cselect_b32 s6, s1, s0 +; GFX7-NEXT: s_cmp_eq_u32 s5, 2 +; GFX7-NEXT: s_cselect_b32 s6, s2, s6 +; GFX7-NEXT: s_cmp_eq_u32 s5, 3 +; GFX7-NEXT: s_cselect_b32 s6, s3, s6 +; GFX7-NEXT: s_and_b32 s4, s4, 1 +; GFX7-NEXT: s_lshl_b32 s4, s4, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX7-NEXT: s_lshl_b32 s4, s7, s4 +; GFX7-NEXT: s_andn2_b32 s4, s6, s4 +; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v8i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 
s[8:11], s[2:3], 0x0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s5, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v8i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 
4, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: s_mov_b32 s8, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, 
s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_mov_b32 s8, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; 
GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v8i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v7, v7, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: 
s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX8-NEXT: v_or_b32_e32 v7, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_or_b32_e32 v7, v1, v2 +; 
GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v8i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshr_b32 s4, s2, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_not_b32 s5, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v7, v1, s5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshr_b32 s4, s2, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_mov_b32 s0, 
0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX8-NEXT: v_or_b32_e32 v7, v1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshr_b32 s4, s2, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_or_b32_e32 v7, v1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; 
GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v7, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v7, s[2:3] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v8i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX7-NEXT: 
v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v3, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx + store <8 x i16> %insert, <8 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v16i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s7, s5, 1 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cselect_b32 s0, s9, s8 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: s_cselect_b32 s0, s10, s0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: s_cselect_b32 s0, s11, s0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; GFX9-NEXT: s_cselect_b32 s0, s12, s0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 +; GFX9-NEXT: s_cselect_b32 s0, s13, s0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: s_cselect_b32 s0, s14, s0 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cselect_b32 s0, s15, s0 +; GFX9-NEXT: s_and_b32 s1, s5, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_lshl_b32 s3, s3, s1 +; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_andn2_b32 s0, s0, s1 +; GFX9-NEXT: s_or_b32 s16, s0, s3 +; GFX9-NEXT: s_cmp_eq_u32 s7, 0 +; GFX9-NEXT: s_cselect_b32 s0, s16, s8 +; GFX9-NEXT: s_cmp_eq_u32 s7, 1 +; GFX9-NEXT: s_cselect_b32 s1, s16, s9 +; GFX9-NEXT: s_cmp_eq_u32 s7, 2 +; GFX9-NEXT: s_cselect_b32 s2, s16, s10 +; GFX9-NEXT: s_cmp_eq_u32 s7, 3 +; GFX9-NEXT: s_cselect_b32 s3, s16, s11 +; GFX9-NEXT: s_cmp_eq_u32 s7, 4 +; 
GFX9-NEXT: s_cselect_b32 s4, s16, s12 +; GFX9-NEXT: s_cmp_eq_u32 s7, 5 +; GFX9-NEXT: s_cselect_b32 s5, s16, s13 +; GFX9-NEXT: s_cmp_eq_u32 s7, 6 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_cselect_b32 s6, s16, s14 +; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cselect_b32 s7, s16, s15 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v16i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX8-NEXT: s_lshr_b32 s7, s5, 1 +; GFX8-NEXT: s_cmp_eq_u32 s7, 1 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cselect_b32 s0, s9, s8 +; GFX8-NEXT: s_cmp_eq_u32 s7, 2 +; GFX8-NEXT: s_cselect_b32 s0, s10, s0 +; GFX8-NEXT: s_cmp_eq_u32 s7, 3 +; GFX8-NEXT: s_cselect_b32 s0, s11, s0 +; GFX8-NEXT: s_cmp_eq_u32 s7, 4 +; GFX8-NEXT: s_cselect_b32 s0, s12, s0 +; GFX8-NEXT: s_cmp_eq_u32 s7, 5 +; GFX8-NEXT: s_cselect_b32 s0, s13, s0 +; GFX8-NEXT: s_cmp_eq_u32 s7, 6 +; GFX8-NEXT: s_cselect_b32 s0, s14, s0 +; GFX8-NEXT: s_cmp_eq_u32 s7, 7 +; GFX8-NEXT: s_cselect_b32 s0, s15, s0 +; GFX8-NEXT: s_and_b32 s1, s5, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_and_b32 s3, s4, s2 +; GFX8-NEXT: s_lshl_b32 s3, s3, s1 +; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: s_or_b32 s16, s0, s3 +; GFX8-NEXT: s_cmp_eq_u32 s7, 0 +; GFX8-NEXT: s_cselect_b32 s0, s16, s8 +; GFX8-NEXT: s_cmp_eq_u32 s7, 1 +; GFX8-NEXT: s_cselect_b32 s1, 
s16, s9 +; GFX8-NEXT: s_cmp_eq_u32 s7, 2 +; GFX8-NEXT: s_cselect_b32 s2, s16, s10 +; GFX8-NEXT: s_cmp_eq_u32 s7, 3 +; GFX8-NEXT: s_cselect_b32 s3, s16, s11 +; GFX8-NEXT: s_cmp_eq_u32 s7, 4 +; GFX8-NEXT: s_cselect_b32 s4, s16, s12 +; GFX8-NEXT: s_cmp_eq_u32 s7, 5 +; GFX8-NEXT: s_cselect_b32 s5, s16, s13 +; GFX8-NEXT: s_cmp_eq_u32 s7, 6 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_cselect_b32 s6, s16, s14 +; GFX8-NEXT: s_cmp_eq_u32 s7, 7 +; GFX8-NEXT: s_cselect_b32 s7, s16, s15 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX7-NEXT: s_lshr_b32 s7, s5, 1 +; GFX7-NEXT: s_cmp_eq_u32 s7, 1 +; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_cselect_b32 s0, s9, s8 +; GFX7-NEXT: s_cmp_eq_u32 s7, 2 +; GFX7-NEXT: s_cselect_b32 s0, s10, s0 +; GFX7-NEXT: s_cmp_eq_u32 s7, 3 +; GFX7-NEXT: s_cselect_b32 s0, s11, s0 +; GFX7-NEXT: s_cmp_eq_u32 s7, 4 +; GFX7-NEXT: s_cselect_b32 s0, s12, s0 +; GFX7-NEXT: s_cmp_eq_u32 s7, 5 +; GFX7-NEXT: s_cselect_b32 s0, s13, s0 +; GFX7-NEXT: s_cmp_eq_u32 s7, 6 +; GFX7-NEXT: s_cselect_b32 s0, s14, s0 +; GFX7-NEXT: s_cmp_eq_u32 s7, 7 +; GFX7-NEXT: s_cselect_b32 s0, s15, s0 +; GFX7-NEXT: s_and_b32 s1, s5, 1 +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_and_b32 s3, s4, s2 +; GFX7-NEXT: s_lshl_b32 s3, s3, s1 +; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_andn2_b32 s0, s0, s1 +; GFX7-NEXT: s_or_b32 s16, 
s0, s3 +; GFX7-NEXT: s_cmp_eq_u32 s7, 0 +; GFX7-NEXT: s_cselect_b32 s0, s16, s8 +; GFX7-NEXT: s_cmp_eq_u32 s7, 1 +; GFX7-NEXT: s_cselect_b32 s1, s16, s9 +; GFX7-NEXT: s_cmp_eq_u32 s7, 2 +; GFX7-NEXT: s_cselect_b32 s2, s16, s10 +; GFX7-NEXT: s_cmp_eq_u32 s7, 3 +; GFX7-NEXT: s_cselect_b32 s3, s16, s11 +; GFX7-NEXT: s_cmp_eq_u32 s7, 4 +; GFX7-NEXT: s_cselect_b32 s4, s16, s12 +; GFX7-NEXT: s_cmp_eq_u32 s7, 5 +; GFX7-NEXT: s_cselect_b32 s5, s16, s13 +; GFX7-NEXT: s_cmp_eq_u32 s7, 6 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_cselect_b32 s6, s16, s14 +; GFX7-NEXT: s_cmp_eq_u32 s7, 7 +; GFX7-NEXT: s_cselect_b32 s7, s16, s15 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_mov_b32_e32 v6, s6 +; GFX7-NEXT: v_mov_b32_e32 v7, s7 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v16i16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_lshr_b32 s12, s3, 1 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_and_b32 s2, s2, s0 +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_not_b32 s13, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 +; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] +; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_lshr_b32 s12, s3, 1 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; 
GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GFX8-NEXT: s_lshl_b32 s13, s2, s1 +; GFX8-NEXT: s_not_b32 s14, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v3, s[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v7, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v8, s14, v8 +; GFX8-NEXT: v_or_b32_e32 v8, s13, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i16_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s18, 0 +; GFX7-NEXT: s_mov_b32 s19, 0xf000 +; GFX7-NEXT: s_mov_b64 s[16:17], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 
addr64 offset:16 +; GFX7-NEXT: s_and_b32 s1, s3, 1 +; GFX7-NEXT: s_lshr_b32 s12, s3, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GFX7-NEXT: s_lshl_b32 s13, s2, s1 +; GFX7-NEXT: s_not_b32 s14, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[10:11] +; GFX7-NEXT: v_and_b32_e32 v0, s14, v0 +; GFX7-NEXT: v_or_b32_e32 v10, s13, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(1 )* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void 
@insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v16i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX9-NEXT: s_lshr_b32 s2, s4, 1 +; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_mov_b32 s3, 0xffff +; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_cselect_b32 s0, s9, s8 +; GFX9-NEXT: s_cmp_eq_u32 s2, 2 +; GFX9-NEXT: s_cselect_b32 s0, s10, s0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 3 +; GFX9-NEXT: s_cselect_b32 s0, s11, s0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 4 +; GFX9-NEXT: s_cselect_b32 s0, s12, s0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 5 +; GFX9-NEXT: s_cselect_b32 s0, s13, s0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 6 +; GFX9-NEXT: s_cselect_b32 s0, s14, s0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 7 +; GFX9-NEXT: s_cselect_b32 s0, s15, s0 +; GFX9-NEXT: s_and_b32 s1, s4, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s1 +; GFX9-NEXT: s_andn2_b32 s0, s0, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 +; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc 
+; GFX9-NEXT: v_mov_b32_e32 v7, s15 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v16i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX8-NEXT: s_lshr_b32 s2, s4, 1 +; GFX8-NEXT: s_cmp_eq_u32 s2, 1 +; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_cselect_b32 s0, s9, s8 +; GFX8-NEXT: s_cmp_eq_u32 s2, 2 +; GFX8-NEXT: s_cselect_b32 s0, s10, s0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 3 +; GFX8-NEXT: s_cselect_b32 s0, s11, s0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 4 +; GFX8-NEXT: s_cselect_b32 s0, s12, s0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 5 +; GFX8-NEXT: s_cselect_b32 s0, s13, s0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 6 +; GFX8-NEXT: s_cselect_b32 s0, s14, s0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 7 +; GFX8-NEXT: s_cselect_b32 s0, s15, s0 +; GFX8-NEXT: s_and_b32 s1, s4, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; 
GFX8-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 +; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v7, s15 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 +; GFX7-NEXT: s_lshr_b32 s2, s4, 1 +; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_mov_b32 s3, 0xffff +; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_cselect_b32 s0, s9, s8 +; GFX7-NEXT: s_cmp_eq_u32 s2, 2 +; GFX7-NEXT: s_cselect_b32 s0, s10, s0 +; GFX7-NEXT: s_cmp_eq_u32 s2, 3 +; GFX7-NEXT: s_cselect_b32 s0, s11, s0 +; GFX7-NEXT: s_cmp_eq_u32 s2, 4 +; GFX7-NEXT: s_cselect_b32 s0, s12, s0 +; GFX7-NEXT: s_cmp_eq_u32 s2, 5 +; GFX7-NEXT: s_cselect_b32 s0, s13, s0 +; GFX7-NEXT: s_cmp_eq_u32 s2, 6 +; GFX7-NEXT: s_cselect_b32 s0, s14, s0 +; GFX7-NEXT: s_cmp_eq_u32 s2, 7 +; GFX7-NEXT: s_cselect_b32 s0, s15, s0 +; GFX7-NEXT: s_and_b32 s1, s4, 1 +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_lshl_b32 s1, s3, s1 +; GFX7-NEXT: s_andn2_b32 s0, s0, s1 +; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; 
GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s13 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 5 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, 4 +; GFX7-NEXT: v_mov_b32_e32 v6, s14 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 6 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 7 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v16i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s18 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s19 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 
v5, s20 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v6, s21 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX9-NEXT: v_mov_b32_e32 v7, s22 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v9, s23 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_or_b32 v9, v1, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s16 +; GFX9-NEXT: v_mov_b32_e32 v2, s18 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v3, s19 +; GFX9-NEXT: v_mov_b32_e32 v4, s20 +; GFX9-NEXT: v_mov_b32_e32 v5, s21 +; GFX9-NEXT: v_mov_b32_e32 v6, s22 +; GFX9-NEXT: v_mov_b32_e32 v7, s23 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: 
insertelement_s_v16i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s19 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s20 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: v_mov_b32_e32 v6, s21 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX8-NEXT: v_mov_b32_e32 v7, s22 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v9, s23 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v9, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: v_mov_b32_e32 v2, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: v_mov_b32_e32 v3, s19 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v5, s21 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: v_mov_b32_e32 v7, s23 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] +; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx8 s[16:23], s[2:3], 0x0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s16 +; GFX7-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v4, s19 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s20 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[14:15], 4, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7-NEXT: s_mov_b32 s5, 0xffff +; GFX7-NEXT: v_mov_b32_e32 v6, s21 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX7-NEXT: v_mov_b32_e32 v7, s22 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_mov_b32_e32 v9, s23 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; 
GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v9, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: v_mov_b32_e32 v2, s18 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX7-NEXT: v_mov_b32_e32 v4, s20 +; GFX7-NEXT: v_mov_b32_e32 v5, s21 +; GFX7-NEXT: v_mov_b32_e32 v6, s22 +; GFX7-NEXT: v_mov_b32_e32 v7, s23 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v16i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_mov_b32_e32 v3, s13 +; GFX9-NEXT: v_mov_b32_e32 v4, s14 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s16 +; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v2, v5, s[2:3] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s17 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_mov_b32_e32 v9, s18 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX9-NEXT: s_mov_b32 s20, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s20 +; GFX9-NEXT: v_mov_b32_e32 v10, s19 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_or_b32 v9, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s14 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s15 +; GFX9-NEXT: v_mov_b32_e32 v4, s16 +; GFX9-NEXT: v_mov_b32_e32 v5, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s18 +; GFX9-NEXT: v_mov_b32_e32 v7, s19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: 
insertelement_s_v16i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NEXT: v_mov_b32_e32 v4, s14 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v6, s16 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, s17 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, s18 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX8-NEXT: s_mov_b32 s20, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s20 +; GFX8-NEXT: v_mov_b32_e32 v10, s19 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s17 +; GFX8-NEXT: v_mov_b32_e32 v6, s18 +; GFX8-NEXT: v_mov_b32_e32 v7, s19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: 
v_cndmask_b32_e64 v0, v0, v9, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX7-NEXT: v_mov_b32_e32 v5, s15 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v8 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: s_mov_b32 s20, 0xffff +; GFX7-NEXT: v_mov_b32_e32 v7, s17 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 +; GFX7-NEXT: v_mov_b32_e32 v9, s18 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[6:7] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s20, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s20, v1 +; GFX7-NEXT: v_mov_b32_e32 v10, s19 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[8:9] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] +; 
GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_mov_b32_e32 v4, s16 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: v_mov_b32_e32 v7, s19 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v16i16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] +; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v1, v11, v1 +; GFX8-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i16_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s18, 0 +; GFX7-NEXT: s_mov_b32 s19, 0xf000 +; GFX7-NEXT: s_mov_b64 s[16:17], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 
v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] +; GFX7-NEXT: v_and_b32_e32 v1, v11, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> 
addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v16i16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshr_b32 s12, s2, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_not_b32 s13, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] +; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] 
+; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[7:8] +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshr_b32 s12, s2, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: s_not_b32 s13, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v1, s13, v1 +; GFX8-NEXT: v_or_b32_e32 v11, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX8-NEXT: 
v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i16_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s18, 0 +; GFX7-NEXT: s_mov_b32 s19, 0xf000 +; GFX7-NEXT: s_mov_b64 s[16:17], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshr_b32 s12, s2, 1 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_not_b32 s13, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] +; GFX7-NEXT: v_and_b32_e32 v1, s13, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v1, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v16i16_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX9-NEXT: s_add_u32 s0, 0, 16 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_addc_u32 s1, 0, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v11, s1 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, s0 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i16_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(1) 
lgkmcnt(1) +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX8-NEXT: s_add_u32 s0, 0, 16 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: s_addc_u32 s1, 0, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, s0 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i16_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s18, 0 +; GFX7-NEXT: s_mov_b32 s19, 0xf000 +; GFX7-NEXT: s_mov_b64 s[16:17], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 
s[4:5], 4, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 +; GFX7-NEXT: s_mov_b32 s18, -1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] +; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v12, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr + %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx + store <16 x i16> %insert, <16 x i16> addrspace(1)* null + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -0,0 +1,5909 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel 
-mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s + +define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v2i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i8_s_s: +; 
GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v2i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i8_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(1 )* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v2i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_load_ushort v1, v[1:2], off +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; 
GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: flat_load_ushort v1, v[1:2] +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x 
i8>, <2 x i8> addrspace(4)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v2i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_load_ushort v1, v[1:2], off +; GFX9-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: flat_load_ushort v1, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, s4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; 
GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v2i8_v_v(<2 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v2i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v2i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v2i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s0, s2 +; GFX7-NEXT: s_mov_b32 s1, s3 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(4)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v2i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, 
v3, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: v_mov_b32_e32 v3, s2 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 
x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i8_v_s(<2 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v2i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v2i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, 
vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v2i8_v_v(<2 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v2i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v2i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; 
GFX7-LABEL: insertelement_v_v2i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <2 x i8>, <2 x i8> addrspace(1)* %ptr + %insert = insertelement <2 x i8> %vec, i8 %val, i32 %idx + store <2 x i8> %insert, <2 x i8> addrspace(1)* null + ret void +} + +; FIXME: 3 element load/store legalization +; define amdgpu_ps void @insertelement_s_v3i8_s_s(<3 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i8_s_s(<3 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(1 )* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_s_v3i8_v_s(<3 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_s_v3i8_s_v(<3 x i8> addrspace(4)* inreg %ptr, i8 inreg 
%val, i32 %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_s_v3i8_v_v(<3 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(4)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i8_s_v(<3 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i8_v_s(<3 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +; define amdgpu_ps void @insertelement_v_v3i8_v_v(<3 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { +; %vec = load <3 x i8>, <3 x i8> addrspace(1)* %ptr +; %insert = insertelement <3 x i8> %vec, i8 %val, i32 %idx +; store <3 x i8> %insert, <3 x i8> addrspace(1)* null +; ret void +; } + +define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v4i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: s_and_b32 s3, s3, 3 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s3, s3, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s1, s3 +; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 
v3, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i8_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_and_b32 s1, s3, 3 +; GFX7-NEXT: s_and_b32 s2, s2, s0 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, s0, s1 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, s2, v0 
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v4i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s6 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s5, s6 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s4, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, 3 +; GFX9-NEXT: s_lshl_b32 s3, s6, s2 +; GFX9-NEXT: s_andn2_b32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, s2, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v1, v0,
s6, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v4i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s5, 0xff +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s1, s0, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, s5, s1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: s_andn2_b32 s0, s0, s1 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v4i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: s_and_b32 s1, s1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s5 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, s5 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s3, s5 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s4, 3 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_lshl_b32 s1, s5, s1 +; GFX7-NEXT: s_andn2_b32 s0, s0, s1 +; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 
%idx) { +; GFX9-LABEL: insertelement_s_v4i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: s_lshr_b32 s5, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s6 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s5, s6 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s4, s6 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s6 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_or_b32 v0, s1, v0, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v1, v0, s6, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v4i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s5, 0xff +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s1, s0, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s5 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, 
s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s5 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, s5 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v4i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: s_and_b32 s1, s1, s5 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s5 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, s5 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s3, s5 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s4, s5 +; GFX7-NEXT: 
v_lshl_b32_e32 v1, s1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s5, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v4i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s5, 0xff +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s1, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s5 +; GFX9-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-NEXT: s_lshr_b32 s4, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s4, s5 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s5 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; 
GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_or_b32 v0, s1, v1, v0 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v1, v0, s5, v1 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v2, v1, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v4i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s1, s0, 8 +; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_lshr_b32 s2, s0, 16 +; GFX8-NEXT: s_lshr_b32 s3, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s4 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v4i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s4, 0xff +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s1, s0, 8 +; GFX7-NEXT: s_and_b32 s1, s1, s4 +; GFX7-NEXT: s_lshr_b32 s2, s0, 16 +; GFX7-NEXT: s_lshr_b32 s3, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s3, s4 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s4, v1 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s4, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: 
s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(4)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v4i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v4 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: 
insertelement_v_v4i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e64 v5, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: 
buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 +; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v4i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_and_b32 
s2, s2, 3 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: s_lshl_b32 s2, s2, 3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s2, s1, s2 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v4, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX9-NEXT: v_or3_b32 v2, v0, v3, v4 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 +; GFX7-NEXT: s_lshl_b32 s1, s0, s1 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v4i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s1 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 +; GFX9-NEXT: v_and_or_b32 v0, v0, v3, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v5, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, v1, v2 +; GFX9-NEXT: v_or3_b32 v2, v0, v3, v5 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v4i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v4i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 +; GFX7-NEXT: s_movk_i32 s2, 0xff +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v3 +; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s2, v5 +; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v6, s2, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: 
v_and_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v3, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + %insert = insertelement <4 x i8> %vec, i8 %val, i32 %idx + store <4 x i8> %insert, <4 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v8i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s10, 0xff +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s10 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s6, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s7, s10 +; GFX9-NEXT: s_lshr_b32 s8, s1, 16 +; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s8, s10 +; GFX9-NEXT: 
s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s5, 2 +; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_cselect_b32 s3, s1, s0 +; GFX9-NEXT: s_and_b32 s5, s5, 3 +; GFX9-NEXT: s_lshl_b32 s5, s5, 3 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_lshl_b32 s4, s4, s5 +; GFX9-NEXT: s_lshl_b32 s5, s10, s5 +; GFX9-NEXT: s_andn2_b32 s3, s3, s5 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s0 +; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s10 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s4, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshr_b32 s5, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s5, s10 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_lshr_b32 s7, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s6, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s7, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s10, 0xff +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_and_b32 s2, s2, s10 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s6, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s7, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s7, s10 +; GFX8-NEXT: s_lshr_b32 s8, s1, 16 +; GFX8-NEXT: s_lshr_b32 s9, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s8, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshr_b32 s2, s5, 2 +; GFX8-NEXT: s_cmp_eq_u32 s2, 1 +; GFX8-NEXT: s_cselect_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s5, s5, 3 +; GFX8-NEXT: s_lshl_b32 s5, s5, 3 +; GFX8-NEXT: s_and_b32 s4, s4, s10 +; GFX8-NEXT: s_lshl_b32 s4, s4, s5 +; GFX8-NEXT: s_lshl_b32 s5, s10, s5 +; GFX8-NEXT: s_andn2_b32 s3, s3, s5 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s0, s3, s0 +; GFX8-NEXT: s_cmp_eq_u32 s2, 1 +; GFX8-NEXT: s_cselect_b32 s1, s3, s1 +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_and_b32 s2, s2, s10 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s4, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s5, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s5, s10 +; GFX8-NEXT: s_lshr_b32 s6, s1, 16 +; GFX8-NEXT: s_lshr_b32 
s7, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s6, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s7, s10 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i8_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s10, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_and_b32 s2, s2, s10 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: s_lshr_b32 s6, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s3, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s6, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshr_b32 s7, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s7, s10 +; GFX7-NEXT: s_lshr_b32 s8, s1, 16 +; GFX7-NEXT: s_lshr_b32 s9, s1, 24 +; GFX7-NEXT: s_and_b32 s1, s1, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s8, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s9, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_lshr_b32 s2, s5, 2 +; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_cselect_b32 s3, s1, s0 +; GFX7-NEXT: s_and_b32 s5, s5, 3 +; GFX7-NEXT: s_lshl_b32 s5, s5, 3 +; GFX7-NEXT: s_and_b32 s4, s4, s10 +; GFX7-NEXT: s_lshl_b32 s4, s4, s5 +; GFX7-NEXT: s_lshl_b32 s5, s10, s5 +; GFX7-NEXT: s_andn2_b32 s3, s3, s5 +; GFX7-NEXT: s_or_b32 s3, s3, s4 +; GFX7-NEXT: s_cmp_eq_u32 s2, 0 +; GFX7-NEXT: s_cselect_b32 s4, s3, s0 
+; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_cselect_b32 s3, s3, s1 +; GFX7-NEXT: s_lshr_b32 s2, s4, 8 +; GFX7-NEXT: s_and_b32 s2, s2, s10 +; GFX7-NEXT: s_lshr_b32 s5, s4, 16 +; GFX7-NEXT: s_lshr_b32 s6, s4, 24 +; GFX7-NEXT: s_and_b32 s4, s4, s10 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s2, s4, s2 +; GFX7-NEXT: s_and_b32 s4, s5, s10 +; GFX7-NEXT: s_lshl_b32 s4, s4, 16 +; GFX7-NEXT: s_or_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s4, s6, s10 +; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s7, s3, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s4, s7, s10 +; GFX7-NEXT: s_lshr_b32 s8, s3, 16 +; GFX7-NEXT: s_lshr_b32 s9, s3, 24 +; GFX7-NEXT: s_and_b32 s3, s3, s10 +; GFX7-NEXT: s_lshl_b32 s4, s4, 8 +; GFX7-NEXT: s_or_b32 s3, s3, s4 +; GFX7-NEXT: s_and_b32 s4, s8, s10 +; GFX7-NEXT: s_lshl_b32 s4, s4, 16 +; GFX7-NEXT: s_or_b32 s3, s3, s4 +; GFX7-NEXT: s_and_b32 s4, s9, s10 +; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_or_b32 s3, s3, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v8i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_lshr_b32 s1, s3, 2 +; GFX9-NEXT: s_and_b32 s3, s3, 3 +; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 
+; GFX9-NEXT: s_not_b32 s3, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v3, v4, s3, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX9-NEXT: v_and_b32_sdwa v7, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: s_lshr_b32 s1, s3, 2 +; GFX8-NEXT: s_and_b32 s3, s3, 3 +; GFX8-NEXT: s_lshl_b32 s3, s3, 3 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, 
v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i8_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s6, 0xff +; GFX7-NEXT: s_and_b32 s1, s3, 3 +; GFX7-NEXT: s_lshr_b32 s0, s3, 2 +; GFX7-NEXT: s_and_b32 s2, s2, s6 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, s6, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: 
s_not_b32 s1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: 
v_and_b32_e32 v7, s6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v8i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s10, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, s10, v0 +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s10 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s6, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s7, s10 +; GFX9-NEXT: s_lshr_b32 s8, s1, 16 +; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s8, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 
+; GFX9-NEXT: s_lshr_b32 s2, s4, 2 +; GFX9-NEXT: s_cmp_eq_u32 s2, 1 +; GFX9-NEXT: s_cselect_b32 s3, s1, s0 +; GFX9-NEXT: s_and_b32 s4, s4, 3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 3 +; GFX9-NEXT: s_lshl_b32 s6, s10, s4 +; GFX9-NEXT: s_andn2_b32 s3, s3, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s10, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v2, v1, s10, v2 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s9, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_and_b32 s2, s2, s9 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: 
s_and_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s5, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s6, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s6, s9 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s7, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s8, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshr_b32 s2, s4, 2 +; GFX8-NEXT: s_cmp_eq_u32 s2, 1 +; GFX8-NEXT: s_cselect_b32 s3, s1, s0 +; GFX8-NEXT: s_and_b32 s4, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_lshl_b32 s4, s9, s4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: s_andn2_b32 s3, s3, s4 +; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; 
GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s9, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, s9, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_and_b32 s2, s2, s9 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: s_lshr_b32 s5, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s3, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s5, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshr_b32 s6, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s6, s9 +; GFX7-NEXT: s_lshr_b32 s7, s1, 16 +; GFX7-NEXT: s_lshr_b32 s8, s1, 24 +; GFX7-NEXT: s_and_b32 s1, s1, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s7, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s8, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_lshr_b32 s2, s4, 2 +; GFX7-NEXT: s_cmp_eq_u32 s2, 1 +; GFX7-NEXT: s_cselect_b32 s3, s1, s0 +; 
GFX7-NEXT: s_and_b32 s4, s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, s4, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX7-NEXT: s_lshl_b32 s4, s9, s4 +; GFX7-NEXT: s_andn2_b32 s3, s3, s4 +; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s9, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) { +; 
GFX9-LABEL: insertelement_s_v8i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s10, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s10 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s6, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s7, s10 +; GFX9-NEXT: s_lshr_b32 s8, s1, 16 +; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s8, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, s10 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_and_b32 s2, s4, s10 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s10 +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s10, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v2, v1, s10, v2 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s10 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s9, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_and_b32 s2, s2, s9 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s5, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s6, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s6, s9 +; GFX8-NEXT: s_lshr_b32 s7, s1, 16 +; GFX8-NEXT: s_lshr_b32 s8, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s7, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; 
GFX8-NEXT: s_and_b32 s2, s8, s9 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_and_b32 s2, s4, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s9 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s9, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_and_b32 s2, s2, s9 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: s_lshr_b32 s5, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s3, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s5, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshr_b32 s6, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s6, s9 +; GFX7-NEXT: s_lshr_b32 s7, s1, 16 +; GFX7-NEXT: s_lshr_b32 s8, s1, 24 +; GFX7-NEXT: s_and_b32 s1, s1, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s7, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s8, s9 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_and_b32 s2, s4, s9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s9, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s9, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v8i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s9, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s2, s0, 8 +; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s9 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s3, s9 +; GFX9-NEXT: s_lshl_b32 
s2, s2, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s5, s9 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_lshr_b32 s6, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s2, s6, s9 +; GFX9-NEXT: s_lshr_b32 s7, s1, 16 +; GFX9-NEXT: s_lshr_b32 s8, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s9 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s7, s9 +; GFX9-NEXT: s_lshl_b32 s2, s2, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s8, s9 +; GFX9-NEXT: s_lshl_b32 s2, s2, 24 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s9 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_or_b32 v3, v3, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX9-NEXT: s_mov_b32 s4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v0, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v2, v4, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v2, v1, s9, v2 +; GFX9-NEXT: v_and_b32_sdwa v3, v1, s9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+; GFX9-NEXT: v_and_b32_sdwa v1, v1, s9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v8i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s8, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s2, s0, 8 +; GFX8-NEXT: s_and_b32 s2, s2, s8 +; GFX8-NEXT: s_lshr_b32 s3, s0, 16 +; GFX8-NEXT: s_lshr_b32 s4, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s3, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s4, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_lshr_b32 s5, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s2, s5, s8 +; GFX8-NEXT: s_lshr_b32 s6, s1, 16 +; GFX8-NEXT: s_lshr_b32 s7, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s6, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s7, s8 +; GFX8-NEXT: s_lshl_b32 s2, s2, 24 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 
v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v3, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v8i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s8, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s2, s0, 8 +; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: s_lshr_b32 s3, s0, 16 +; GFX7-NEXT: s_lshr_b32 s4, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s8 +; GFX7-NEXT: s_lshl_b32 
s2, s2, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s3, s8 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s4, s8 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_lshr_b32 s5, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s2, s5, s8 +; GFX7-NEXT: s_lshr_b32 s6, s1, 16 +; GFX7-NEXT: s_lshr_b32 s7, s1, 24 +; GFX7-NEXT: s_and_b32 s1, s1, s8 +; GFX7-NEXT: s_lshl_b32 s2, s2, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s6, s8 +; GFX7-NEXT: s_lshl_b32 s2, s2, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: s_and_b32 s2, s7, s8 +; GFX7-NEXT: s_lshl_b32 s2, s2, 24 +; GFX7-NEXT: s_or_b32 s1, s1, s2 +; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_or_b32_e32 v3, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s8, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v8i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: s_and_b32 s1, s2, s3 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e64 v5, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; 
GFX9-NEXT: v_and_b32_sdwa v10, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v7 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v10, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v6, v2, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 +; GFX9-NEXT: v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v12, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s3, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_and_b32 s0, s2, s3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, s3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s3, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s3, v9 +; GFX7-NEXT: 
v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s3, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s3, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = 
load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v8i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: s_lshr_b32 s1, s2, 2 +; GFX9-NEXT: s_and_b32 s2, s2, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, 3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX9-NEXT: s_not_b32 s2, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v7, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v4, s2, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v5, v0, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v6, v0, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v2 +; GFX9-NEXT: v_and_b32_sdwa v7, v1, s3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s3 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_or3_b32 v0, v0, v5, v6 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_lshr_b32 s1, s2, 2 +; GFX8-NEXT: s_and_b32 s2, s2, 3 +; GFX8-NEXT: s_lshl_b32 s2, s2, 3 +; GFX8-NEXT: v_mov_b32_e32 v6, s2 +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX8-NEXT: v_and_b32_sdwa v8, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, s0, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v6, v0, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v1, v5 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s3, 0xff +; GFX7-NEXT: s_and_b32 s1, s2, 3 +; GFX7-NEXT: s_lshr_b32 s0, s2, 2 +; GFX7-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_lshl_b32 s1, s3, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s3, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 +; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s3, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; 
GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, s3, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s3, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s3, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s3, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v8i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v3, 3, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v9, v0, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s1, v7 +; GFX9-NEXT: v_and_b32_sdwa v11, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s1, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v9, v10 +; GFX9-NEXT: v_or3_b32 v1, v1, v11, v12 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v7, v3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v6, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; 
GFX9-NEXT: v_and_or_b32 v0, v0, v4, v2 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_or3_b32 v0, v0, v6, v7 +; GFX9-NEXT: v_or3_b32 v1, v1, v8, v9 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v8i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v3, v5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v5, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v0, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v4, v1, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v8i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; 
GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff +; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v6, v6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v7, v7, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v5, v5, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v8, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm + %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr + %insert = insertelement <8 x i8> %vec, i8 %val, i32 %idx + store <8 x i8> %insert, <8 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v16i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s18, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s6, s0, 8 +; GFX9-NEXT: s_and_b32 s6, s6, s18 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s7, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s8, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: 
s_lshr_b32 s9, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s9, s18 +; GFX9-NEXT: s_lshr_b32 s10, s1, 16 +; GFX9-NEXT: s_lshr_b32 s11, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s6, s10, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s6, s11, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshr_b32 s12, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s6, s12, s18 +; GFX9-NEXT: s_lshr_b32 s13, s2, 16 +; GFX9-NEXT: s_lshr_b32 s14, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s13, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s14, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshr_b32 s15, s3, 8 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s15, s18 +; GFX9-NEXT: s_lshr_b32 s16, s3, 16 +; GFX9-NEXT: s_lshr_b32 s17, s3, 24 +; GFX9-NEXT: s_and_b32 s3, s3, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_and_b32 s6, s16, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_and_b32 s6, s17, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s6, s5, 2 +; GFX9-NEXT: s_cmp_eq_u32 s6, 1 +; GFX9-NEXT: s_cselect_b32 s7, s1, s0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 2 +; GFX9-NEXT: s_cselect_b32 s7, s2, s7 +; GFX9-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-NEXT: s_cselect_b32 s7, s3, s7 +; GFX9-NEXT: s_and_b32 s5, s5, 3 +; GFX9-NEXT: s_lshl_b32 s5, s5, 3 +; GFX9-NEXT: s_and_b32 s4, s4, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, s5 +; GFX9-NEXT: s_lshl_b32 s5, s18, s5 +; GFX9-NEXT: s_andn2_b32 s5, s7, s5 +; GFX9-NEXT: s_or_b32 s4, s5, s4 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s0, s4, s0 +; 
GFX9-NEXT: s_cmp_eq_u32 s6, 1 +; GFX9-NEXT: s_cselect_b32 s1, s4, s1 +; GFX9-NEXT: s_cmp_eq_u32 s6, 2 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_lshr_b32 s4, s0, 8 +; GFX9-NEXT: s_and_b32 s4, s4, s18 +; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_and_b32 s4, s5, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_and_b32 s4, s6, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_and_b32 s4, s7, s18 +; GFX9-NEXT: s_lshr_b32 s8, s1, 16 +; GFX9-NEXT: s_lshr_b32 s9, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s4, s8, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s4, s9, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshr_b32 s10, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s4, s10, s18 +; GFX9-NEXT: s_lshr_b32 s11, s2, 16 +; GFX9-NEXT: s_lshr_b32 s12, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s11, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s12, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshr_b32 s13, s3, 8 +; GFX9-NEXT: s_or_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s4, s13, s18 +; GFX9-NEXT: s_lshr_b32 s14, s3, 16 +; GFX9-NEXT: s_lshr_b32 s15, s3, 24 +; GFX9-NEXT: s_and_b32 s3, s3, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s14, s18 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s15, s18 +; 
GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_or_b32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v16i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s18, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s6, s0, 8 +; GFX8-NEXT: s_and_b32 s6, s6, s18 +; GFX8-NEXT: s_lshr_b32 s7, s0, 16 +; GFX8-NEXT: s_lshr_b32 s8, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s6, s7, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s6, s8, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshr_b32 s9, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s6, s9, s18 +; GFX8-NEXT: s_lshr_b32 s10, s1, 16 +; GFX8-NEXT: s_lshr_b32 s11, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s6, s10, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s6, s11, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshr_b32 s12, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s6, s12, s18 +; GFX8-NEXT: s_lshr_b32 s13, s2, 16 +; GFX8-NEXT: s_lshr_b32 s14, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s6, s13, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s6, s14, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_lshr_b32 s15, s3, 8 +; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s6, s15, s18 
+; GFX8-NEXT: s_lshr_b32 s16, s3, 16 +; GFX8-NEXT: s_lshr_b32 s17, s3, 24 +; GFX8-NEXT: s_and_b32 s3, s3, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_or_b32 s3, s3, s6 +; GFX8-NEXT: s_and_b32 s6, s16, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s6 +; GFX8-NEXT: s_and_b32 s6, s17, s18 +; GFX8-NEXT: s_lshl_b32 s6, s6, 24 +; GFX8-NEXT: s_or_b32 s3, s3, s6 +; GFX8-NEXT: s_lshr_b32 s6, s5, 2 +; GFX8-NEXT: s_cmp_eq_u32 s6, 1 +; GFX8-NEXT: s_cselect_b32 s7, s1, s0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 2 +; GFX8-NEXT: s_cselect_b32 s7, s2, s7 +; GFX8-NEXT: s_cmp_eq_u32 s6, 3 +; GFX8-NEXT: s_cselect_b32 s7, s3, s7 +; GFX8-NEXT: s_and_b32 s5, s5, 3 +; GFX8-NEXT: s_lshl_b32 s5, s5, 3 +; GFX8-NEXT: s_and_b32 s4, s4, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, s5 +; GFX8-NEXT: s_lshl_b32 s5, s18, s5 +; GFX8-NEXT: s_andn2_b32 s5, s7, s5 +; GFX8-NEXT: s_or_b32 s4, s5, s4 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_cselect_b32 s0, s4, s0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 1 +; GFX8-NEXT: s_cselect_b32 s1, s4, s1 +; GFX8-NEXT: s_cmp_eq_u32 s6, 2 +; GFX8-NEXT: s_cselect_b32 s2, s4, s2 +; GFX8-NEXT: s_cmp_eq_u32 s6, 3 +; GFX8-NEXT: s_cselect_b32 s3, s4, s3 +; GFX8-NEXT: s_lshr_b32 s4, s0, 8 +; GFX8-NEXT: s_and_b32 s4, s4, s18 +; GFX8-NEXT: s_lshr_b32 s5, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s5, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s6, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s7, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s7, s18 +; GFX8-NEXT: s_lshr_b32 s8, s1, 16 +; GFX8-NEXT: s_lshr_b32 s9, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, s8, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; 
GFX8-NEXT: s_and_b32 s4, s9, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s10, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s4 +; GFX8-NEXT: s_and_b32 s4, s10, s18 +; GFX8-NEXT: s_lshr_b32 s11, s2, 16 +; GFX8-NEXT: s_lshr_b32 s12, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, s11, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, s12, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s13, s3, 8 +; GFX8-NEXT: s_or_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s4, s13, s18 +; GFX8-NEXT: s_lshr_b32 s14, s3, 16 +; GFX8-NEXT: s_lshr_b32 s15, s3, 24 +; GFX8-NEXT: s_and_b32 s3, s3, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s14, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s15, s18 +; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_or_b32 s3, s3, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i8_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s18, 0xff +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s6, s0, 8 +; GFX7-NEXT: s_and_b32 s6, s6, s18 +; GFX7-NEXT: s_lshr_b32 s7, s0, 16 +; GFX7-NEXT: s_lshr_b32 s8, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s6 +; GFX7-NEXT: s_and_b32 s6, s7, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s6 +; GFX7-NEXT: s_and_b32 s6, s8, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s9, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s6 +; GFX7-NEXT: s_and_b32 s6, s9, s18 +; GFX7-NEXT: s_lshr_b32 s10, s1, 16 +; 
GFX7-NEXT: s_lshr_b32 s11, s1, 24 +; GFX7-NEXT: s_and_b32 s1, s1, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s6 +; GFX7-NEXT: s_and_b32 s6, s10, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s6 +; GFX7-NEXT: s_and_b32 s6, s11, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s12, s2, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s6 +; GFX7-NEXT: s_and_b32 s6, s12, s18 +; GFX7-NEXT: s_lshr_b32 s13, s2, 16 +; GFX7-NEXT: s_lshr_b32 s14, s2, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s6, s13, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s6, s14, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s15, s3, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s6, s15, s18 +; GFX7-NEXT: s_lshr_b32 s16, s3, 16 +; GFX7-NEXT: s_lshr_b32 s17, s3, 24 +; GFX7-NEXT: s_and_b32 s3, s3, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_or_b32 s3, s3, s6 +; GFX7-NEXT: s_and_b32 s6, s16, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_or_b32 s3, s3, s6 +; GFX7-NEXT: s_and_b32 s6, s17, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_or_b32 s3, s3, s6 +; GFX7-NEXT: s_lshr_b32 s6, s5, 2 +; GFX7-NEXT: s_cmp_eq_u32 s6, 1 +; GFX7-NEXT: s_cselect_b32 s7, s1, s0 +; GFX7-NEXT: s_cmp_eq_u32 s6, 2 +; GFX7-NEXT: s_cselect_b32 s7, s2, s7 +; GFX7-NEXT: s_cmp_eq_u32 s6, 3 +; GFX7-NEXT: s_cselect_b32 s7, s3, s7 +; GFX7-NEXT: s_and_b32 s5, s5, 3 +; GFX7-NEXT: s_lshl_b32 s5, s5, 3 +; GFX7-NEXT: s_and_b32 s4, s4, s18 +; GFX7-NEXT: s_lshl_b32 s4, s4, s5 +; GFX7-NEXT: s_lshl_b32 s5, s18, s5 +; GFX7-NEXT: s_andn2_b32 s5, s7, s5 +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: s_cmp_eq_u32 s6, 0 +; GFX7-NEXT: s_cselect_b32 s5, s4, s0 +; GFX7-NEXT: s_cmp_eq_u32 s6, 1 +; GFX7-NEXT: s_cselect_b32 s7, s4, s1 +; GFX7-NEXT: s_cmp_eq_u32 s6, 2 +; GFX7-NEXT: s_cselect_b32 s2, s4, 
s2 +; GFX7-NEXT: s_cmp_eq_u32 s6, 3 +; GFX7-NEXT: s_cselect_b32 s3, s4, s3 +; GFX7-NEXT: s_lshr_b32 s4, s5, 8 +; GFX7-NEXT: s_and_b32 s4, s4, s18 +; GFX7-NEXT: s_lshr_b32 s6, s5, 16 +; GFX7-NEXT: s_lshr_b32 s8, s5, 24 +; GFX7-NEXT: s_and_b32 s5, s5, s18 +; GFX7-NEXT: s_lshl_b32 s4, s4, 8 +; GFX7-NEXT: s_or_b32 s4, s5, s4 +; GFX7-NEXT: s_and_b32 s5, s6, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_lshr_b32 s9, s7, 8 +; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s5, s8, s18 +; GFX7-NEXT: s_and_b32 s6, s9, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_or_b32 s4, s4, s5 +; GFX7-NEXT: s_lshr_b32 s10, s7, 16 +; GFX7-NEXT: s_and_b32 s5, s7, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_or_b32 s5, s5, s6 +; GFX7-NEXT: s_and_b32 s6, s10, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_lshr_b32 s11, s7, 24 +; GFX7-NEXT: s_or_b32 s5, s5, s6 +; GFX7-NEXT: s_and_b32 s6, s11, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s12, s2, 8 +; GFX7-NEXT: s_or_b32 s5, s5, s6 +; GFX7-NEXT: s_and_b32 s6, s12, s18 +; GFX7-NEXT: s_lshr_b32 s13, s2, 16 +; GFX7-NEXT: s_lshr_b32 s14, s2, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s6, s13, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 16 +; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s6, s14, s18 +; GFX7-NEXT: s_lshl_b32 s6, s6, 24 +; GFX7-NEXT: s_lshr_b32 s15, s3, 8 +; GFX7-NEXT: s_or_b32 s6, s2, s6 +; GFX7-NEXT: s_lshr_b32 s16, s3, 16 +; GFX7-NEXT: s_lshr_b32 s17, s3, 24 +; GFX7-NEXT: s_and_b32 s2, s3, s18 +; GFX7-NEXT: s_and_b32 s3, s15, s18 +; GFX7-NEXT: s_lshl_b32 s3, s3, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: s_and_b32 s3, s16, s18 +; GFX7-NEXT: s_lshl_b32 s3, s3, 16 +; GFX7-NEXT: s_or_b32 s2, s2, s3 +; GFX7-NEXT: s_and_b32 s3, s17, s18 +; GFX7-NEXT: s_lshl_b32 s3, s3, 24 +; GFX7-NEXT: s_or_b32 s7, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 
+; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v16i8_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: s_and_b32 s1, s3, 3 +; GFX9-NEXT: s_lshr_b32 s4, s3, 2 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_lshl_b32 s1, s1, 3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, s6, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: s_not_b32 s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v6 +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD 
src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v14, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v11 +; GFX9-NEXT: v_or3_b32 v1, v1, v12, v13 +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v9 +; GFX9-NEXT: v_or3_b32 v2, v2, v14, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_or3_b32 v3, v3, v16, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[2:3] +; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, 
v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v9, v0, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v0, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v5 +; GFX9-NEXT: v_and_b32_sdwa v11, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v12, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v6 +; GFX9-NEXT: v_and_b32_sdwa v13, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v7 +; GFX9-NEXT: v_and_b32_sdwa v15, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_or3_b32 v0, v0, v9, v10 +; GFX9-NEXT: v_or3_b32 v1, v1, v11, v12 +; GFX9-NEXT: v_or3_b32 v2, v2, v13, v14 +; GFX9-NEXT: v_or3_b32 v3, v3, v15, v16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i8_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, s0 +; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: s_lshr_b32 s4, s3, 2 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_and_b32 s2, s2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX8-NEXT: s_lshl_b32 s5, s2, s1 +; GFX8-NEXT: s_not_b32 s6, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v12, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_and_b32_sdwa v15, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_and_b32_sdwa v17, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v18, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v14 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_or_b32_e32 v3, v3, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX8-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v12, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v1, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v2, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v3, v6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v14 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i8_s_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_movk_i32 s6, 0xff +; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_lshr_b32 s4, s3, 2 +; GFX7-NEXT: s_and_b32 s1, s2, s6 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: s_not_b32 s7, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: 
v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_and_b32_e32 v8, s6, v8 +; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX7-NEXT: v_and_b32_e32 v9, s6, v9 +; GFX7-NEXT: v_and_b32_e32 v11, s6, v11 +; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 +; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, 
v1, vcc +; GFX7-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v4, s7, v4 +; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v12 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(1 )* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_s_v16i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s18, 0xff +; GFX9-NEXT: v_and_b32_e32 v0, s18, v0 +; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s6, s0, 8 +; GFX9-NEXT: s_and_b32 s6, s6, s18 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s7, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s8, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshr_b32 s9, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s9, s18 +; GFX9-NEXT: s_lshr_b32 s10, s1, 16 +; GFX9-NEXT: s_lshr_b32 s11, s1, 24 +; GFX9-NEXT: s_and_b32 s1, s1, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s6, s10, s18 +; GFX9-NEXT: 
s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s6, s11, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshr_b32 s12, s2, 8 +; GFX9-NEXT: s_or_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s6, s12, s18 +; GFX9-NEXT: s_lshr_b32 s13, s2, 16 +; GFX9-NEXT: s_lshr_b32 s14, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s13, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s14, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_lshr_b32 s15, s3, 8 +; GFX9-NEXT: s_or_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s6, s15, s18 +; GFX9-NEXT: s_lshr_b32 s16, s3, 16 +; GFX9-NEXT: s_lshr_b32 s17, s3, 24 +; GFX9-NEXT: s_and_b32 s3, s3, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_and_b32 s6, s16, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 16 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_and_b32 s6, s17, s18 +; GFX9-NEXT: s_lshl_b32 s6, s6, 24 +; GFX9-NEXT: s_or_b32 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 2 +; GFX9-NEXT: s_cmp_eq_u32 s6, 1 +; GFX9-NEXT: s_cselect_b32 s7, s1, s0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 2 +; GFX9-NEXT: s_cselect_b32 s7, s2, s7 +; GFX9-NEXT: s_cmp_eq_u32 s6, 3 +; GFX9-NEXT: s_cselect_b32 s7, s3, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 3 +; GFX9-NEXT: s_lshl_b32 s4, s4, 3 +; GFX9-NEXT: s_lshl_b32 s8, s18, s4 +; GFX9-NEXT: s_andn2_b32 s7, s7, s8 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s6, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s6, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s6, 2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: 
v_cmp_eq_u32_e64 vcc, s6, 3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v5, v2, s18, v5 +; GFX9-NEXT: v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v3, s18, v4 +; GFX9-NEXT: v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; 
GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v16i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s17, 0xff +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s5, s0, 8 +; GFX8-NEXT: s_and_b32 s5, s5, s17 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, s6, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, s7, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshr_b32 s8, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, s8, s17 +; GFX8-NEXT: s_lshr_b32 s9, s1, 16 +; GFX8-NEXT: s_lshr_b32 s10, s1, 24 +; GFX8-NEXT: s_and_b32 s1, s1, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s5 +; GFX8-NEXT: s_and_b32 s5, s9, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 16 +; GFX8-NEXT: s_or_b32 s1, s1, s5 +; GFX8-NEXT: s_and_b32 s5, s10, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshr_b32 s11, s2, 8 +; GFX8-NEXT: s_or_b32 s1, s1, s5 +; GFX8-NEXT: s_and_b32 s5, s11, s17 +; GFX8-NEXT: s_lshr_b32 s12, s2, 16 +; GFX8-NEXT: s_lshr_b32 s13, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_and_b32 s5, s12, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 16 +; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_and_b32 s5, s13, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshr_b32 s14, s3, 8 +; GFX8-NEXT: s_or_b32 s2, s2, s5 +; GFX8-NEXT: s_and_b32 s5, s14, s17 +; GFX8-NEXT: s_lshr_b32 s15, s3, 16 +; GFX8-NEXT: s_lshr_b32 s16, s3, 24 +; GFX8-NEXT: s_and_b32 s3, s3, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_and_b32 s5, s15, s17 +; GFX8-NEXT: s_lshl_b32 s5, 
s5, 16 +; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_and_b32 s5, s16, s17 +; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_lshr_b32 s5, s4, 2 +; GFX8-NEXT: s_cmp_eq_u32 s5, 1 +; GFX8-NEXT: s_cselect_b32 s6, s1, s0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 2 +; GFX8-NEXT: s_cselect_b32 s6, s2, s6 +; GFX8-NEXT: s_cmp_eq_u32 s5, 3 +; GFX8-NEXT: s_cselect_b32 s6, s3, s6 +; GFX8-NEXT: s_and_b32 s4, s4, 3 +; GFX8-NEXT: s_lshl_b32 s4, s4, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_lshl_b32 s4, s17, s4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: s_andn2_b32 s4, s6, s4 +; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v8, s17 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s17, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, s17, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s5, s0, 8 +; GFX7-NEXT: s_and_b32 s5, s5, s17 +; GFX7-NEXT: s_lshr_b32 s6, s0, 16 +; GFX7-NEXT: s_lshr_b32 s7, 
s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_and_b32 s5, s6, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_and_b32 s5, s7, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s8, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_and_b32 s5, s8, s17 +; GFX7-NEXT: s_lshr_b32 s9, s1, 16 +; GFX7-NEXT: s_lshr_b32 s10, s1, 24 +; GFX7-NEXT: s_and_b32 s1, s1, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s5 +; GFX7-NEXT: s_and_b32 s5, s9, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_or_b32 s1, s1, s5 +; GFX7-NEXT: s_and_b32 s5, s10, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s11, s2, 8 +; GFX7-NEXT: s_or_b32 s1, s1, s5 +; GFX7-NEXT: s_and_b32 s5, s11, s17 +; GFX7-NEXT: s_lshr_b32 s12, s2, 16 +; GFX7-NEXT: s_lshr_b32 s13, s2, 24 +; GFX7-NEXT: s_and_b32 s2, s2, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s5 +; GFX7-NEXT: s_and_b32 s5, s12, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_or_b32 s2, s2, s5 +; GFX7-NEXT: s_and_b32 s5, s13, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s14, s3, 8 +; GFX7-NEXT: s_or_b32 s2, s2, s5 +; GFX7-NEXT: s_and_b32 s5, s14, s17 +; GFX7-NEXT: s_lshr_b32 s15, s3, 16 +; GFX7-NEXT: s_lshr_b32 s16, s3, 24 +; GFX7-NEXT: s_and_b32 s3, s3, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_or_b32 s3, s3, s5 +; GFX7-NEXT: s_and_b32 s5, s15, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_or_b32 s3, s3, s5 +; GFX7-NEXT: s_and_b32 s5, s16, s17 +; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_or_b32 s3, s3, s5 +; GFX7-NEXT: s_lshr_b32 s5, s4, 2 +; GFX7-NEXT: s_cmp_eq_u32 s5, 1 +; GFX7-NEXT: s_cselect_b32 s6, s1, s0 +; GFX7-NEXT: s_cmp_eq_u32 s5, 2 +; GFX7-NEXT: s_cselect_b32 s6, s2, s6 +; GFX7-NEXT: s_cmp_eq_u32 s5, 3 +; GFX7-NEXT: s_cselect_b32 s6, s3, s6 +; GFX7-NEXT: s_and_b32 s4, 
s4, 3 +; GFX7-NEXT: s_lshl_b32 s4, s4, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 +; GFX7-NEXT: s_lshl_b32 s4, s17, s4 +; GFX7-NEXT: s_andn2_b32 s4, s6, s4 +; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s17, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s17, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s17, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 
v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s17, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s17, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_s_v16i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s18, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s5, s0, 8 +; GFX9-NEXT: s_and_b32 s5, s5, s18 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s18 +; GFX9-NEXT: s_lshl_b32 s5, s5, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, s18 +; GFX9-NEXT: s_lshl_b32 s5, s5, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s5 +; GFX9-NEXT: s_and_b32 s5, s8, s18 
+; GFX9-NEXT: s_lshl_b32 s5, s5, 24 +; GFX9-NEXT: s_lshr_b32 s9, s1, 8 +; GFX9-NEXT: s_or_b32 s8, s0, s5 +; GFX9-NEXT: s_lshr_b32 s10, s1, 16 +; GFX9-NEXT: s_lshr_b32 s11, s1, 24 +; GFX9-NEXT: s_and_b32 s0, s1, s18 +; GFX9-NEXT: s_and_b32 s1, s9, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s10, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s11, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshr_b32 s12, s2, 8 +; GFX9-NEXT: s_or_b32 s9, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s12, s18 +; GFX9-NEXT: s_lshr_b32 s13, s2, 16 +; GFX9-NEXT: s_and_b32 s0, s2, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s13, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_lshr_b32 s14, s2, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s14, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshr_b32 s15, s3, 8 +; GFX9-NEXT: s_or_b32 s10, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s15, s18 +; GFX9-NEXT: s_lshr_b32 s16, s3, 16 +; GFX9-NEXT: s_and_b32 s0, s3, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s16, s18 +; GFX9-NEXT: s_lshr_b32 s17, s3, 24 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s17, s18 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: s_or_b32 s11, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_and_b32 s4, s4, s18 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s18 +; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 
v1, v5, s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: s_mov_b32 s6, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_and_or_b32 v4, v0, s18, v4 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NEXT: v_and_or_b32 v5, v1, s18, v5 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v5, v2, s18, v5 +; GFX9-NEXT: v_and_b32_sdwa v6, v2, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD 
src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v3, s18, v4 +; GFX9-NEXT: v_and_b32_sdwa v5, v3, s18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s18 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v16i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s18, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s5, s0, 8 +; GFX8-NEXT: s_and_b32 s5, s5, s18 +; GFX8-NEXT: s_lshr_b32 s6, s0, 16 +; GFX8-NEXT: s_lshr_b32 s7, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s18 +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, s6, s18 +; GFX8-NEXT: s_lshl_b32 s5, s5, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s5 +; GFX8-NEXT: s_and_b32 s5, s7, s18 +; GFX8-NEXT: s_lshl_b32 s5, s5, 24 +; GFX8-NEXT: s_lshr_b32 s9, s1, 8 +; GFX8-NEXT: s_or_b32 s8, s0, s5 +; GFX8-NEXT: s_lshr_b32 s10, s1, 16 +; GFX8-NEXT: s_lshr_b32 s11, s1, 24 +; GFX8-NEXT: s_and_b32 s0, s1, s18 +; GFX8-NEXT: s_and_b32 s1, s9, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s10, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s11, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshr_b32 s12, s2, 8 +; GFX8-NEXT: s_or_b32 s9, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s12, s18 +; GFX8-NEXT: s_lshr_b32 s13, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s2, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s13, s18 +; 
GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b32 s14, s2, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s14, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshr_b32 s15, s3, 8 +; GFX8-NEXT: s_or_b32 s10, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s15, s18 +; GFX8-NEXT: s_lshr_b32 s16, s3, 16 +; GFX8-NEXT: s_and_b32 s0, s3, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s16, s18 +; GFX8-NEXT: s_lshr_b32 s17, s3, 24 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s17, s18 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: v_mov_b32_e32 v1, s8 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: s_or_b32 s11, s0, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_and_b32 s4, s4, s18 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s4 +; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s18 +; GFX8-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v8, s18 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s11 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s18, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s5, s0, 8 +; GFX7-NEXT: s_and_b32 s5, s5, s18 +; GFX7-NEXT: s_lshr_b32 s6, s0, 16 +; GFX7-NEXT: s_lshr_b32 s7, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_and_b32 s5, s6, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s5 +; GFX7-NEXT: s_and_b32 s5, s7, s18 +; GFX7-NEXT: s_lshl_b32 s5, s5, 24 +; GFX7-NEXT: s_lshr_b32 s9, s1, 8 +; GFX7-NEXT: s_or_b32 s8, s0, s5 +; GFX7-NEXT: s_lshr_b32 s10, s1, 16 +; GFX7-NEXT: s_lshr_b32 s11, s1, 24 +; GFX7-NEXT: s_and_b32 s0, s1, s18 +; GFX7-NEXT: s_and_b32 s1, s9, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s10, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s11, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshr_b32 s12, s2, 8 +; GFX7-NEXT: s_or_b32 s9, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s12, s18 +; GFX7-NEXT: s_lshr_b32 s13, s2, 16 +; GFX7-NEXT: s_and_b32 s0, s2, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s13, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s14, s2, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s14, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshr_b32 s15, s3, 8 +; GFX7-NEXT: s_or_b32 s10, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s15, s18 +; GFX7-NEXT: s_lshr_b32 s16, s3, 16 +; 
GFX7-NEXT: s_and_b32 s0, s3, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s16, s18 +; GFX7-NEXT: s_lshr_b32 s17, s3, 24 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s17, s18 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: v_mov_b32_e32 v1, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: s_or_b32 s11, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_and_b32 s4, s4, s18 +; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, s18, v0 +; GFX7-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] +; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s18, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 
v4, s18, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s18, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s18, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s18, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s18, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg %ptr, i8 %val, i32 
%idx) { +; GFX9-LABEL: insertelement_s_v16i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX9-NEXT: s_movk_i32 s17, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshr_b32 s4, s0, 8 +; GFX9-NEXT: s_and_b32 s4, s4, s17 +; GFX9-NEXT: s_lshr_b32 s5, s0, 16 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_and_b32 s0, s0, s17 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_and_b32 s4, s5, s17 +; GFX9-NEXT: s_lshl_b32 s4, s4, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s4 +; GFX9-NEXT: s_and_b32 s4, s6, s17 +; GFX9-NEXT: s_lshl_b32 s4, s4, 24 +; GFX9-NEXT: s_lshr_b32 s7, s1, 8 +; GFX9-NEXT: s_or_b32 s4, s0, s4 +; GFX9-NEXT: s_lshr_b32 s9, s1, 16 +; GFX9-NEXT: s_lshr_b32 s10, s1, 24 +; GFX9-NEXT: s_and_b32 s0, s1, s17 +; GFX9-NEXT: s_and_b32 s1, s7, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s9, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s10, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshr_b32 s11, s2, 8 +; GFX9-NEXT: s_or_b32 s5, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s11, s17 +; GFX9-NEXT: s_lshr_b32 s12, s2, 16 +; GFX9-NEXT: s_and_b32 s0, s2, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s12, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s13, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: s_lshr_b32 s14, s3, 8 +; GFX9-NEXT: s_or_b32 s6, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s14, s17 +; GFX9-NEXT: s_lshr_b32 s15, s3, 16 +; GFX9-NEXT: s_and_b32 s0, s3, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 8 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s15, s17 +; GFX9-NEXT: s_lshr_b32 s16, s3, 24 +; GFX9-NEXT: 
s_lshl_b32 s1, s1, 16 +; GFX9-NEXT: s_or_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s16, s17 +; GFX9-NEXT: s_lshl_b32 s1, s1, 24 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: s_or_b32 s7, s0, s1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s17 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: s_mov_b32 s8, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_and_or_b32 v4, v0, s17, v4 +; GFX9-NEXT: v_and_b32_sdwa v8, v0, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v0, v0, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v0, v4, v8, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NEXT: v_and_or_b32 v5, v1, s17, v5 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, 
s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v1, v1, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v5, v8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v5, v2, s17, v5 +; GFX9-NEXT: v_and_b32_sdwa v6, v2, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v2, v2, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v2, v5, v6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v3, s17, v4 +; GFX9-NEXT: v_and_b32_sdwa v5, v3, s17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v3, v3, s17 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v3, v4, v5, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_s_v16i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX8-NEXT: s_movk_i32 s16, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshr_b32 s4, s0, 8 +; GFX8-NEXT: s_and_b32 s4, s4, s16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 16 +; GFX8-NEXT: s_lshr_b32 s6, s0, 24 +; GFX8-NEXT: s_and_b32 s0, s0, s16 +; GFX8-NEXT: s_lshl_b32 s4, s4, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s5, s16 +; GFX8-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s4, s6, s16 +; GFX8-NEXT: s_lshl_b32 s4, s4, 24 +; GFX8-NEXT: s_lshr_b32 s7, s1, 8 +; 
GFX8-NEXT: s_or_b32 s4, s0, s4 +; GFX8-NEXT: s_lshr_b32 s8, s1, 16 +; GFX8-NEXT: s_lshr_b32 s9, s1, 24 +; GFX8-NEXT: s_and_b32 s0, s1, s16 +; GFX8-NEXT: s_and_b32 s1, s7, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s8, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s9, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshr_b32 s10, s2, 8 +; GFX8-NEXT: s_or_b32 s5, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s10, s16 +; GFX8-NEXT: s_lshr_b32 s11, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s2, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s11, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_lshr_b32 s12, s2, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s12, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_lshr_b32 s13, s3, 8 +; GFX8-NEXT: s_or_b32 s6, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s13, s16 +; GFX8-NEXT: s_lshr_b32 s14, s3, 16 +; GFX8-NEXT: s_and_b32 s0, s3, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 8 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s14, s16 +; GFX8-NEXT: s_lshr_b32 s15, s3, 24 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s15, s16 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: s_or_b32 s7, s0, s1 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s16 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX8-NEXT: v_xor_b32_e32 v1, 
-1, v1 +; GFX8-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v8, s16 +; GFX8-NEXT: v_or_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_and_b32_sdwa v0, v0, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v9, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX8-NEXT: v_and_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: 
v_and_b32_sdwa v6, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_and_b32_sdwa v3, v3, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_s_v16i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 +; GFX7-NEXT: s_movk_i32 s16, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshr_b32 s4, s0, 8 +; GFX7-NEXT: s_and_b32 s4, s4, s16 +; GFX7-NEXT: s_lshr_b32 s5, s0, 16 +; GFX7-NEXT: s_lshr_b32 s6, s0, 24 +; GFX7-NEXT: s_and_b32 s0, s0, s16 +; GFX7-NEXT: s_lshl_b32 s4, s4, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s4, s5, s16 +; GFX7-NEXT: s_lshl_b32 s4, s4, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s4, s6, s16 +; GFX7-NEXT: s_lshl_b32 s4, s4, 24 +; GFX7-NEXT: s_lshr_b32 s7, s1, 8 +; GFX7-NEXT: s_or_b32 s4, s0, s4 +; GFX7-NEXT: s_lshr_b32 s8, s1, 16 +; GFX7-NEXT: s_lshr_b32 s9, s1, 24 +; GFX7-NEXT: s_and_b32 s0, s1, s16 +; GFX7-NEXT: s_and_b32 s1, s7, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s8, s16 +; GFX7-NEXT: s_lshl_b32 
s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s9, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshr_b32 s10, s2, 8 +; GFX7-NEXT: s_or_b32 s5, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s10, s16 +; GFX7-NEXT: s_lshr_b32 s11, s2, 16 +; GFX7-NEXT: s_and_b32 s0, s2, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s11, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_lshr_b32 s12, s2, 24 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s12, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: s_lshr_b32 s13, s3, 8 +; GFX7-NEXT: s_or_b32 s6, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s13, s16 +; GFX7-NEXT: s_lshr_b32 s14, s3, 16 +; GFX7-NEXT: s_and_b32 s0, s3, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 8 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s14, s16 +; GFX7-NEXT: s_lshr_b32 s15, s3, 24 +; GFX7-NEXT: s_lshl_b32 s1, s1, 16 +; GFX7-NEXT: s_or_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s15, s16 +; GFX7-NEXT: s_lshl_b32 s1, s1, 24 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: s_or_b32 s7, s0, s1 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, s16, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] +; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: 
v_lshrrev_b32_e32 v4, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v4 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: 
v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s16, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(4)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i8 inreg %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v16i8_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: s_and_b32 s1, s2, s6 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v7, v2, s1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v8 +; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v16, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v3, v3, v12, v13 +; GFX9-NEXT: v_or3_b32 v4, v4, v14, v15 +; GFX9-NEXT: v_and_b32_sdwa v18, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v19, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v6, v6, s6, v11 +; GFX9-NEXT: v_or3_b32 v5, v5, v16, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v1 +; GFX9-NEXT: v_or3_b32 v6, v6, v18, v19 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v8, v2, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v0, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v5 +; GFX9-NEXT: v_and_or_b32 v5, v2, s6, v0 +; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v7 +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 +; GFX9-NEXT: v_or3_b32 v0, v1, v9, v10 +; GFX9-NEXT: v_or3_b32 v1, v3, v11, v12 +; GFX9-NEXT: v_or3_b32 v2, v4, v13, v14 +; GFX9-NEXT: v_or3_b32 v3, v5, v15, v16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i8_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 2, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX8-NEXT: s_and_b32 s1, s2, s0 
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_lshlrev_b32_e64 v9, v2, s1 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v14, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_and_b32_sdwa v17, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 +; GFX8-NEXT: v_and_b32_sdwa v18, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v5, v7 
dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v18 +; GFX8-NEXT: v_and_b32_sdwa v6, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v10 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v3, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v14 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v13 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i8_s_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_movk_i32 s6, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: s_and_b32 s0, s2, s6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX7-NEXT: v_lshl_b32_e32 v18, s0, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 +; GFX7-NEXT: v_lshl_b32_e32 
v2, s6, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v8, s6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s6, v9 +; GFX7-NEXT: v_and_b32_e32 v11, s6, v11 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 +; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 +; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v13, s6, v13 +; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_and_b32_e32 v16, s6, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; 
GFX7-NEXT: v_or_b32_e32 v3, v4, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v7 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX7-NEXT: v_and_b32_e32 
v2, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i8 %val, i32 inreg %idx) { +; GFX9-LABEL: insertelement_v_v16i8_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: s_and_b32 s1, s2, 3 +; GFX9-NEXT: s_lshl_b32 s1, s1, 3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s2, 2 +; GFX9-NEXT: s_lshl_b32 s1, s6, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: s_not_b32 s5, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX9-NEXT: 
v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v6 +; GFX9-NEXT: v_and_b32_sdwa v10, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v3, s6, v2 +; GFX9-NEXT: v_and_b32_sdwa v12, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v4, s6, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v14, v5, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v5, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v4, v5, s6, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or3_b32 v2, v2, v10, v11 +; GFX9-NEXT: v_or3_b32 v3, v3, v12, v13 +; GFX9-NEXT: v_and_b32_sdwa v16, v6, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v6, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v6, s6, v9 +; GFX9-NEXT: v_or3_b32 v4, v4, v14, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v3, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_or3_b32 v5, v5, v16, v17 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] +; GFX9-NEXT: v_and_or_b32 v1, v6, s5, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v9, v2, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v2, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v5 +; GFX9-NEXT: v_and_or_b32 v5, v1, s6, v0 +; GFX9-NEXT: v_and_b32_sdwa v11, v3, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v12, v3, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v4, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v4, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v1, s6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v1, s6 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 +; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v7 +; GFX9-NEXT: v_or3_b32 v0, v2, v9, v10 +; GFX9-NEXT: v_or3_b32 v1, v3, v11, v12 +; GFX9-NEXT: v_or3_b32 v2, v4, v13, v14 +; GFX9-NEXT: v_or3_b32 
v3, v5, v15, v16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i8_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] +; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_mov_b32_e32 v8, s1 +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v0, 8 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-NEXT: s_lshr_b32 s4, s2, 2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX8-NEXT: s_not_b32 s5, s0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 +; GFX8-NEXT: 
v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX8-NEXT: v_and_b32_sdwa v16, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v17, v5, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX8-NEXT: v_and_b32_sdwa v18, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v13 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_and_b32_sdwa v19, v6, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v6, s5, v6 +; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
GFX8-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v12, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v3, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v4, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v0, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v0, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v2, v7 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v2, v3, v12 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v14 +; GFX8-NEXT: v_or_b32_e32 v4, v1, v16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v13 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: 
insertelement_v_v16i8_v_s: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_movk_i32 s6, 0xff +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: s_lshr_b32 s4, s2, 2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v2 +; GFX7-NEXT: v_and_b32_e32 v9, s6, v9 +; GFX7-NEXT: v_and_b32_e32 v11, s6, v11 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v7, s6, v7 +; GFX7-NEXT: v_and_b32_e32 v10, s6, v10 +; GFX7-NEXT: v_and_b32_e32 v12, s6, v12 +; GFX7-NEXT: v_and_b32_e32 v14, s6, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v6 +; GFX7-NEXT: 
v_and_b32_e32 v13, s6, v13 +; GFX7-NEXT: v_and_b32_e32 v15, s6, v15 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_and_b32_e32 v16, s6, v16 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v13 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc +; GFX7-NEXT: v_or_b32_e32 v4, v4, v16 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v2 +; GFX7-NEXT: 
v_and_b32_e32 v1, s6, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v15 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} + +define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i8 %val, i32 %idx) { +; GFX9-LABEL: insertelement_v_v16i8_v_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; 
GFX9-NEXT: s_movk_i32 s1, 0xff +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v3 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, v3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v8 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v13, v4, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v4, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v4, v4, s1, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v15, v5, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v16, v5, s1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v5, v5, s1, v10 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v17, v6, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v6, v6, s1, v11 +; GFX9-NEXT: v_and_b32_sdwa v19, v7, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v9, v7, v0, v12 +; GFX9-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v4, v4, v13, v14 +; GFX9-NEXT: v_or3_b32 v5, v5, v15, v16 +; GFX9-NEXT: v_or3_b32 v7, v9, v19, v7 +; GFX9-NEXT: v_or3_b32 v6, v6, v17, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v4, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v7, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v9, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_sdwa v12, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v13, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v14, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v15, v5, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_or_b32 v4, v4, v0, v7 +; GFX9-NEXT: v_and_or_b32 v5, v5, v0, v8 +; GFX9-NEXT: v_and_b32_sdwa v10, v3, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v6 +; GFX9-NEXT: v_and_or_b32 v6, v2, v0, v1 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_or3_b32 v1, v4, v12, v13 +; GFX9-NEXT: v_or3_b32 v2, v5, v14, v15 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v10, v11 +; GFX9-NEXT: v_or3_b32 v3, v6, v16, v17 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off +; GFX9-NEXT: s_endpgm +; +; GFX8-LABEL: insertelement_v_v16i8_v_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v9, s0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 2, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v10 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, v3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v10 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 8, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_sdwa v15, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v4, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v17, v5, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v4, v5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v8, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_and_b32_sdwa v9, v5, v9 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v17 +; GFX8-NEXT: v_and_b32_sdwa v18, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v5, v6, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v19, v6, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v7, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v6, v7, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v5, v5, v18 +; GFX8-NEXT: v_and_b32_sdwa v7, v7, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v6, v6, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v16 +; GFX8-NEXT: v_or_b32_e32 v4, v4, v9 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] 
+; GFX8-NEXT: v_and_b32_e32 v3, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v5, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_and_b32_sdwa v10, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v11, v1, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v12, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v13, v3, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v14, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v15, v4, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v17, v2, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: 
v_or_b32_sdwa v3, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v3, v3, v14 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v16 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v10 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v4, v17 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX7-LABEL: insertelement_v_v16i8_v_v: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v8, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v19 +; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 8, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 8, v6 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 8, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v11 +; 
GFX7-NEXT: v_and_b32_e32 v13, v13, v8 +; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 24, v6 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v12 +; GFX7-NEXT: v_and_b32_e32 v14, v14, v8 +; GFX7-NEXT: v_and_b32_e32 v16, v16, v8 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 24, v7 +; GFX7-NEXT: v_and_b32_e32 v15, v15, v8 +; GFX7-NEXT: v_and_b32_e32 v17, v17, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_and_b32_e32 v7, v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_and_b32_e32 v18, v18, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; GFX7-NEXT: v_or_b32_e32 v4, v5, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v16 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX7-NEXT: v_or_b32_e32 v5, v6, v17 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v19 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v18 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v5, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v3, v6, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v4, v2, s[0:1] +; GFX7-NEXT: 
v_cndmask_b32_e64 v4, v5, v2, s[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v5, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v7, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v1, v1, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v9, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, v10, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v2, v3, v8 +; GFX7-NEXT: v_and_b32_e32 v3, v11, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, v12, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_and_b32_e32 v3, v13, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v16, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v14, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v15, 
v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v16, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 +; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GFX7-NEXT: s_endpgm + %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + %insert = insertelement <16 x i8> %vec, i8 %val, i32 %idx + store <16 x i8> %insert, <16 x i8> addrspace(1)* null + ret void +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir +++ llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir @@ -1738,3 +1738,195 @@ %5:_(p1) = COPY $vgpr0_vgpr1 G_STORE %4, %5 :: (store 256, align 4, addrspace 1) ... + +--- +name: insert_vector_elt_varidx_v4s8 + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: insert_vector_elt_varidx_v4s8 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]] + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: 
[[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C1]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]] + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C2]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C4]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND4]], [[C4]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[SHL3]](s32) + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[C3]], [[SHL3]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL5]], [[C5]] + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[OR2]], [[XOR]] + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL4]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C]](s32) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C1]](s32) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[OR3]], [[C2]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[OR3]](s32) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]] + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY9]], [[C3]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[SHL6]] + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY10]], [[C3]] + ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[C1]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL7]] + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[COPY11]], 
[[C3]] + ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[AND10]], [[C2]](s32) + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[OR5]], [[SHL8]] + ; CHECK: $vgpr0 = COPY [[OR6]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(<4 x s8>) = G_BITCAST %0 + %4:_(s8) = G_TRUNC %1 + %5:_(<4 x s8>) = G_INSERT_VECTOR_ELT %3, %4, %2 + %6:_(s32) = G_BITCAST %5 + $vgpr0 = COPY %6 +... + +--- +name: insert_vector_elt_varidx_v8s8 + +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2, $vgpr3 + ; CHECK-LABEL: name: insert_vector_elt_varidx_v8s8 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[UV]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) + ; CHECK: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[UV1]](s32) + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; CHECK: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C1]](s16) + ; CHECK: [[LSHR3:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C1]](s16) + ; CHECK: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC2]], [[C1]](s16) + ; CHECK: [[LSHR5:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C1]](s16) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C2]] + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CHECK: [[OR:%[0-9]+]]:_(s32) = 
G_OR [[AND]], [[SHL]] + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C2]] + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C2]] + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 + ; CHECK: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32) + ; CHECK: [[OR2:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL2]] + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C2]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C2]] + ; CHECK: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C3]](s32) + ; CHECK: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL3]] + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C2]] + ; CHECK: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND6]], [[C]](s32) + ; CHECK: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR3]], [[SHL4]] + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR5]](s16) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C2]] + ; CHECK: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32) + ; CHECK: [[OR5:%[0-9]+]]:_(s32) = G_OR [[OR4]], [[SHL5]] + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C5]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[BUILD_VECTOR]](<2 x s32>), [[LSHR6]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; CHECK: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[AND8]], [[C6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: 
[[AND9:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C2]] + ; CHECK: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[AND9]], [[SHL6]](s32) + ; CHECK: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[C2]], [[SHL6]](s32) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[SHL8]], [[C7]] + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[EVEC]], [[XOR]] + ; CHECK: [[OR6:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL7]] + ; CHECK: [[IVEC:%[0-9]+]]:_(<2 x s32>) = G_INSERT_VECTOR_ELT [[BUILD_VECTOR]], [[OR6]](s32), [[LSHR6]](s32) + ; CHECK: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[IVEC]](<2 x s32>) + ; CHECK: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C3]](s32) + ; CHECK: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C]](s32) + ; CHECK: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV2]], [[C4]](s32) + ; CHECK: [[LSHR10:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32) + ; CHECK: [[LSHR11:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32) + ; CHECK: [[LSHR12:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C4]](s32) + ; CHECK: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CHECK: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[UV2]](s32) + ; CHECK: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C8]] + ; CHECK: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR7]](s32) + ; CHECK: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C8]] + ; CHECK: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND12]], [[C1]](s16) + ; CHECK: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND11]], [[SHL9]] + ; CHECK: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR8]](s32) + ; CHECK: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C8]] + ; CHECK: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR9]](s32) + ; CHECK: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C8]] + ; CHECK: [[SHL10:%[0-9]+]]:_(s16) = G_SHL [[AND14]], [[C1]](s16) + ; CHECK: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND13]], [[SHL10]] + ; CHECK: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[UV3]](s32) + ; CHECK: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C8]] + ; CHECK: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC 
[[LSHR10]](s32) + ; CHECK: [[AND16:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C8]] + ; CHECK: [[SHL11:%[0-9]+]]:_(s16) = G_SHL [[AND16]], [[C1]](s16) + ; CHECK: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND15]], [[SHL11]] + ; CHECK: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR11]](s32) + ; CHECK: [[AND17:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C8]] + ; CHECK: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR12]](s32) + ; CHECK: [[AND18:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C8]] + ; CHECK: [[SHL12:%[0-9]+]]:_(s16) = G_SHL [[AND18]], [[C1]](s16) + ; CHECK: [[OR10:%[0-9]+]]:_(s16) = G_OR [[AND17]], [[SHL12]] + ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16) + ; CHECK: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16) + ; CHECK: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32) + ; CHECK: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL13]] + ; CHECK: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16) + ; CHECK: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16) + ; CHECK: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32) + ; CHECK: [[OR12:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL14]] + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR11]](s32), [[OR12]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s32) = COPY $vgpr3 + %3:_(<8 x s8>) = G_BITCAST %0 + %4:_(s8) = G_TRUNC %1 + %5:_(<8 x s8>) = G_INSERT_VECTOR_ELT %3, %4, %2 + %6:_(s64) = G_BITCAST %5 + $vgpr0_vgpr1 = COPY %6 +...