Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -299,6 +299,13 @@
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
 
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
+
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
 
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
@@ -4072,14 +4079,18 @@
   SDValue InsVal = Op.getOperand(1);
   SDValue Idx = Op.getOperand(2);
   EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
-  assert(VecVT.getScalarSizeInBits() == 16);
+  assert(VecSize <= 64);
 
   unsigned NumElts = VecVT.getVectorNumElements();
   SDLoc SL(Op);
   auto KIdx = dyn_cast<ConstantSDNode>(Idx);
 
-  if (NumElts == 4 && KIdx) {
+  if (NumElts == 4 && EltSize == 16 && KIdx) {
     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
 
     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
@@ -4106,22 +4117,24 @@
     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
   }
 
-  assert(NumElts == 2 || NumElts == 4);
-
   if (isa<ConstantSDNode>(Idx))
     return SDValue();
 
-  EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
+  MVT IntVT = MVT::getIntegerVT(VecSize);
 
   // Avoid stack access for dynamic indexing.
-  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
+  SDValue Val = InsVal;
+  if (InsVal.getValueType() == MVT::f16)
+    Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
 
   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
   SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
 
+  assert(isPowerOf2_32(EltSize));
+  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
   // Convert vector index to bit-index.
-  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
-                                  DAG.getConstant(4, SL, MVT::i32));
+  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
 
   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
@@ -4144,8 +4157,9 @@
   SDValue Vec = Op.getOperand(0);
   SDValue Idx = Op.getOperand(1);
   EVT VecVT = Vec.getValueType();
-  unsigned NumElts = VecVT.getVectorNumElements();
-  assert(VecVT.getScalarSizeInBits() == 16 && (NumElts == 2 || NumElts == 4));
+  unsigned VecSize = VecVT.getSizeInBits();
+  EVT EltVT = VecVT.getVectorElementType();
+  assert(VecSize <= 64);
 
   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
@@ -4156,11 +4170,14 @@
   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
     return Combined;
 
-  EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
-  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+  unsigned EltSize = EltVT.getSizeInBits();
+  assert(isPowerOf2_32(EltSize));
+
+  MVT IntVT = MVT::getIntegerVT(VecSize);
+  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
 
-  // Convert vector index to bit-index (* 16)
-  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four);
+  // Convert vector index to bit-index (* EltSize)
+  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
 
   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
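Note for readers: the dynamic-index paths above treat the whole (at most 64-bit) vector as one integer and perform a bitfield insert or extract at a shifted bit offset, as the v_bfi_b32 comment hints. A minimal scalar sketch of the insert-side bit math, assuming a power-of-two EltSize; it sketches the idea rather than transcribing the DAG nodes, and the names are illustrative, not taken from the tree:

  #include <cstdint>

  // Scalar model of the dynamic INSERT_VECTOR_ELT lowering: build a
  // v_bfm-style bitfield mask at the scaled bit index, then combine
  // v_bfi-style: (mask & shifted val) | (~mask & old vector).
  uint64_t insertEltBits(uint64_t Vec, uint64_t Val, unsigned Idx,
                         unsigned EltSize) {
    unsigned ScaledIdx = Idx * EltSize;                  // SHL Idx, Log2_32(EltSize)
    uint64_t BFM = ((1ull << EltSize) - 1) << ScaledIdx; // mask covering the element
    return (BFM & (Val << ScaledIdx)) | (~BFM & Vec);
  }

The extract side is the reverse: shift the packed value right by the same scaled index and truncate, which is what the SRL at the end of the last hunk produces.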
Index: test/CodeGen/AMDGPU/amdgpu.private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -252,8 +252,8 @@
 
 ; R600-VECT: MOVA_INT
 
-; SI-PROMOTE-VECT-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
-; SI-PROMOTE-VECT-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
+; SI-PROMOTE-VECT-DAG: s_lshl_b32
+; SI-PROMOTE-VECT-DAG: v_lshrrev
 
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
Index: test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
===================================================================
--- test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
@@ -10,7 +10,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v2i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -24,7 +24,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v3i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v3i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -38,7 +38,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v4i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v4i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -52,7 +52,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v8i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -66,7 +66,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v16i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v16i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -80,7 +80,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v32i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -94,7 +94,7 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}extract_vector_elt_v64i8:
+; GCN-LABEL: {{^}}extract_vector_elt_v64i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
@@ -108,18 +108,32 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; FIXME: SI generates much worse code for this that's a pain to match
 
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
+; VI-DAG: buffer_load_ushort [[LOAD:v[0-9]+]],
+; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
 
-; GCN: buffer_store_byte
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
+; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[LOAD]]
+; VI: buffer_store_byte [[EXTRACT]]
+define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
+  %elt = extractelement <2 x i8> %foo, i32 %idx
+  store i8 %elt, i8 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
+; VI-DAG: buffer_load_ubyte [[LOAD2:v[0-9]+]],
+; VI-DAG: buffer_load_ushort [[LOAD01:v[0-9]+]],
+; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
+
+; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+
+; VI: v_lshlrev_b32_e32 [[ELT2:v[0-9]+]], 16, [[LOAD2]]
+; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[LOAD01]], [[ELT2]]
+; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
+; VI: buffer_store_byte [[EXTRACT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -127,20 +141,14 @@
   ret void
 }
 
-; FUNC-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
+; VI-DAG: s_load_dword [[VEC:s[0-9]+]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
 
-; GCN: buffer_store_byte
-; GCN: buffer_load_ubyte
-; GCN: buffer_store_byte
+; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: s_lshr_b32 [[EXTRACT:s[0-9]+]], [[VEC]], [[SCALED_IDX]]
+; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
+; VI: buffer_store_byte [[V_EXTRACT]]
 define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -148,4 +156,19 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
+; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s[0:1], 0x2c
+; VI-DAG: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
+
+; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: s_lshr_b64 s{{\[}}[[EXTRACT_LO:[0-9]+]]:{{[0-9]+\]}}, [[VEC]], [[SCALED_IDX]]
+; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], s[[EXTRACT_LO]]
+; VI: buffer_store_byte [[V_EXTRACT]]
+define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo, i32 %idx) #0 {
+  %p0 = extractelement <8 x i8> %foo, i32 %idx
+  %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
+  store i8 %p0, i8 addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { nounwind }
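The dynamic extract checks above all pin down the same shape the new lowering produces: scale the index to a bit offset (the s_lshl_b32 by 3), shift the packed vector right, and store the low byte. A scalar model, with illustrative names not taken from the patch:

  #include <cstdint>

  // Models SRL(bitcast(vec), idx * 8) followed by truncation to the i8
  // element, for a vector packed into at most 64 bits.
  uint8_t extractByte(uint64_t Vec, unsigned Idx) {
    unsigned ScaledIdx = Idx << 3;    // vector index -> bit index
    return uint8_t(Vec >> ScaledIdx); // truncation keeps the addressed byte
  }

For <2 x i8> the value fits in 16 bits and selects to v_lshrrev_b16; the <4 x i8> and <8 x i8> cases keep the uniform index on the scalar unit as s_lshr_b32 and s_lshr_b64.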
Index: test/CodeGen/AMDGPU/function-returns.ll
===================================================================
--- test/CodeGen/AMDGPU/function-returns.ll
+++ test/CodeGen/AMDGPU/function-returns.ll
@@ -349,10 +349,9 @@
 ; FIXME: Should pack
 ; GCN-LABEL: {{^}}v4i8_func_void:
 ; GCN: buffer_load_dword v0
+; GCN-DAG: v_lshrrev_b32_e32 v1, 8, v0
 ; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
 ; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
-; CI-DAG: v_bfe_u32 v1, v0, 8, 8
-; GFX89-DAG: v_lshrrev_b16_e32 v1, 8, v0
 ; GCN: s_setpc_b64
 define <4 x i8> @v4i8_func_void() #0 {
   %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(4)* undef
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -202,19 +202,15 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
-
-; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-TONGA: buffer_load_ushort
-
-; GCN: buffer_store_short v{{[0-9]+}}, off
+; VI: buffer_load_ushort [[LOAD:v[0-9]]]
+; VI: s_load_dword [[IDX:s[0-9]]]
+; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI: v_lshlrev_b16_e64 [[SHL:v[0-9]+]], [[SCALED_IDX]], -1
+; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[SHL]]
+; VI: v_and_b32_e32 [[AND0:v[0-9]+]], 5, [[SHL]]
+; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[LOAD]]
+; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[AND0]], [[AND1]]
+; VI: buffer_store_short [[OR]]
 define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
   store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
@@ -222,22 +218,16 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
-
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-TONGA: buffer_load_ushort
-; GCN-TONGA: buffer_load_ubyte
-
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
+; VI: buffer_load_ubyte
+; VI: buffer_load_ushort
+; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 3
+; VI: s_lshl_b32 s{{[0-9]+}}, 0xffff,
+; VI: s_not_b32
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_or_b32_e32
+; VI: v_and_b32
+; VI: v_bfi_b32
+; VI: v_lshrrev_b32
 define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
   store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
@@ -245,25 +235,12 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-; GCN: buffer_load_ubyte v{{[0-9]+}}, off
-
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:7
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
-
-; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-NO-TONGA: buffer_load_ubyte
-; GCN-TONGA: buffer_load_dword
-
-; GCN: buffer_store_dword v{{[0-9]+}}, off
+; VI: s_load_dword [[VEC:s[0-9]+]]
+; VI: s_load_dword [[IDX:s[0-9]]]
+; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
+; VI-DAG: v_mov_b32_e32 [[V_VEC:v[0-9]+]], [[VEC]]
+; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[MASK]], 5, [[V_VEC]]
 define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
@@ -271,6 +248,19 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
+; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; VI: s_load_dword [[IDX:s[0-9]]]
+; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
+; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
+; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
+; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
+; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
+; VI: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[NOT_MASK]], [[VEC]]
+; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5
+; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]]
+; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]
+; VI: v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]
+; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}}
 define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
@@ -278,6 +268,42 @@
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+; GCN: buffer_store_byte
+
+; GCN: buffer_store_byte
+; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
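One reference point for reading the v_bfi_b32 checks in the insertelement tests: the instruction is a bitfield select of its three operands. A scalar sketch of that semantic, which to my understanding matches the usual (S0 & S1) | (~S0 & S2) definition from the GCN ISA documentation:

  #include <cstdint>

  // v_bfi_b32 dst, s0, s1, s2: take s1's bits where the mask s0 is set,
  // and s2's bits elsewhere.
  uint32_t bfi(uint32_t S0, uint32_t S1, uint32_t S2) {
    return (S0 & S1) | (~S0 & S2);
  }

With S0 = 0xffff shifted by the scaled index, S1 = the inserted value, and S2 = the packed vector, this is the combine the v4i8 test checks; the v8i8 test carries out the analogous select on the scalar unit in 64-bit halves with s_not_b64/s_and_b64/s_or_b64.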