Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -152,6 +152,7 @@
   SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;

   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -679,6 +679,7 @@
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -8114,6 +8115,42 @@
   return SDValue();
 }

+SDValue SITargetLowering::performInsertVectorEltCombine(
+  SDNode *N, DAGCombinerInfo &DCI) const {
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  // INSERT_VECTOR_ELT (<n x e>, var-idx)
+  // => BUILD_VECTOR n x select (e, const-idx)
+  // This eliminates the non-constant index and the subsequent movrel or
+  // scratch access.
+  // Sub-dword vectors of 2 dwords or less already have a better
+  // implementation.
+  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
+  // instructions.
+  if (isa<ConstantSDNode>(Idx) ||
+      VecSize > 256 || (VecSize <= 64 && EltSize < 32))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+  SDValue Ins = N->getOperand(1);
+  EVT IdxVT = Idx.getValueType();
+
+  SmallVector<SDValue, 16> Ops;
+  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
+    SDValue IC = DAG.getConstant(I, SL, IdxVT);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
+    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
+    Ops.push_back(V);
+  }
+
+  return DAG.getBuildVector(VecVT, SL, Ops);
+}
+
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0, const SDNode *N1) const {
@@ -8722,6 +8759,8 @@
   }
   case ISD::EXTRACT_VECTOR_ELT:
     return performExtractVectorEltCombine(N, DCI);
+  case ISD::INSERT_VECTOR_ELT:
+    return performInsertVectorEltCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
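To see what the combine produces: for a <4 x float> insert with a variable index, the BUILD_VECTOR of selects lowers to one v_cmp_*_u32/v_cndmask_b32 pair per element, with no m0 write (movrel) and no scratch round-trip. A minimal scalar model of the expansion, with plain C++ standing in for the DAG nodes (the function name is illustrative, not from the patch):

#include <array>
#include <cstdint>

// Each loop iteration models one getSelectCC node of the combine; the
// returned array plays the role of the final BUILD_VECTOR.
std::array<float, 4> insertDynamic(std::array<float, 4> Vec, float Ins,
                                   uint32_t Idx) {
  std::array<float, 4> Out;
  for (uint32_t I = 0; I < 4; ++I)
    Out[I] = (Idx == I) ? Ins : Vec[I]; // select (Idx == I), Ins, Vec[I]
  return Out;
}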
Index: test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -9,11 +9,14 @@
 ; CHECK: s_load_dword [[IN:s[0-9]+]]
 ; CHECK: s_mov_b32 m0, [[IN]]
 ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
-; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+; CHECK: buffer_store_dwordx4
+; CHECK: buffer_store_dwordx4
+; CHECK: buffer_store_dwordx4
+; CHECK: buffer_store_dwordx4
+define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
 entry:
-  %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
-  store <4 x float> %ins, <4 x float> addrspace(1)* %out
+  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
+  store <16 x float> %ins, <16 x float> addrspace(1)* %out
   ret void
 }
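Note how this test keeps its v_movreld check by growing the vector to <16 x float>: at 512 bits the new combine bails out and the old movrel path is still exercised. The guard at the top of performInsertVectorEltCombine can be restated as a standalone predicate; a sketch, assuming sizes in bits as in the patch (the helper name is hypothetical):

// Constant indices keep the existing lowering; vectors over 8 dwords
// would need too many v_cndmask_b32; packed sub-dword vectors of up to
// 2 dwords already have a cheaper shift/mask lowering.
bool shouldExpandToSelects(bool IdxIsConstant, unsigned VecSizeBits,
                           unsigned EltSizeBits) {
  if (IdxIsConstant)
    return false;
  if (VecSizeBits > 256)                     // more than 8 dwords
    return false;
  if (VecSizeBits <= 64 && EltSizeBits < 32) // packed 16/8-bit case
    return false;
  return true;
}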
Index: test/CodeGen/AMDGPU/indirect-addressing-si.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -182,15 +182,16 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
 ; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
 ; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
-; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
+; GCN-DAG: v_mov_b32_e32 v[[ELT15:[0-9]+]], 0x41800000
+; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x41880000
 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
-define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_w_offset(<16 x float> addrspace(1)* %out, i32 %in) {
 entry:
-  %0 = add i32 %in, 1
-  %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
-  store <4 x float> %1, <4 x float> addrspace(1)* %out
+  %add = add i32 %in, 1
+  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %add
+  store <16 x float> %ins, <16 x float> addrspace(1)* %out
   ret void
 }

@@ -205,27 +206,27 @@
 ; IDXMODE-NEXT: s_set_gpr_idx_off

 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<16 x float> addrspace(1)* %out, i32 %in) {
 entry:
-  %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
-  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  %ins = insertelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, float 17.0, i32 %in
+  store <16 x float> %ins, <16 x float> addrspace(1)* %out
   ret void
 }

 ; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
 ; The offset depends on the register that holds the first element of the vector.
 ; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
-; MOVREL: v_movreld_b32_e32 v0, 5
+; MOVREL: v_movreld_b32_e32 v0, 16

 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
+; IDXMODE-NEXT: v_mov_b32_e32 v0, 16
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, i32 16, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }

@@ -241,11 +242,11 @@
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out, <16 x i32> %vec, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
-  %value = insertelement <4 x i32> %vec, i32 5, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> %vec, i32 5, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }

@@ -256,6 +257,18 @@
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 5{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 6{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 7{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 8{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 9{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 10{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 11{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 12{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 13{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 14{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 15{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 16{{$}}

 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
 ; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
@@ -263,23 +276,23 @@
 ; GCN: s_and_saveexec_b64 vcc, vcc

 ; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
-; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
+; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 33

 ; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
-; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
+; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 33
 ; IDXMODE: s_set_gpr_idx_off

 ; GCN: s_cbranch_execnz [[LOOPBB]]
 ; GCN: s_mov_b64 exec, [[SAVEEXEC]]

 ; GCN: buffer_store_dword
-define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
-  %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 33, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }

@@ -289,6 +302,18 @@
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 5{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 6{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 7{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 8{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 9{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 10{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 11{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 12{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 13{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 14{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 15{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 16{{$}}
 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}

 ; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
@@ -305,12 +330,12 @@
 ; IDXMODE: s_set_gpr_idx_off

 ; GCN: s_cbranch_execnz
-define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <16 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -16
-  %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 500, i32 %index
-  store <4 x i32> %value, <4 x i32> addrspace(1)* %out
+  %value = insertelement <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>, i32 500, i32 %index
+  store <16 x i32> %value, <16 x i32> addrspace(1)* %out
   ret void
 }

@@ -393,13 +418,11 @@
 }

 ; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
-; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; GCN-DAG: s_load_dwordx16 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT15:[0-9]+]]{{\]}}
 ; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
 ; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62

-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT1:3]], s{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT15:[0-9]+]], s[[S_ELT15]]
 ; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]

 ; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
@@ -429,10 +452,10 @@
 ; GCN: s_and_saveexec_b64 vcc, vcc

 ; MOVREL: s_mov_b32 m0, [[READLANE]]
-; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
+; MOVREL-NEXT: v_movreld_b32_e32 v{{[0-9]+}}, 63

 ; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
-; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
+; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 63
 ; IDXMODE: s_set_gpr_idx_off
 ; GCN-NEXT: s_xor_b64 exec, exec, vcc

@@ -441,7 +464,7 @@

 ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
 ; GCN: buffer_store_dword [[INS0]]
-define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<16 x i32> addrspace(1)* %out0, <16 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <16 x i32> %vec0) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
@@ -449,9 +472,9 @@
   %idx0 = load volatile i32, i32 addrspace(1)* %gep
   %idx1 = add i32 %idx0, 1
   %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
-  %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
-  %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
-  store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+  %vec1 = insertelement <16 x i32> %vec0, i32 %live.out.val, i32 %idx0
+  %vec2 = insertelement <16 x i32> %vec1, i32 63, i32 %idx1
+  store volatile <16 x i32> %vec2, <16 x i32> addrspace(1)* %out0
   %cmp = icmp eq i32 %id, 0
   br i1 %cmp, label %bb1, label %bb2

@@ -495,10 +518,10 @@
 ; GCN: s_load_dword [[ARG:s[0-9]+]]

 ; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
-; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; MOVREL: s_waitcnt
 ; MOVREL: s_add_i32 m0, [[ARG]], -16
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, 4.0
+; MOVREL: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
 ; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, -4.0
 ; MOVREL: s_mov_b32 m0, -1
@@ -520,13 +543,13 @@
 define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
 bb:
   %tmp1 = add i32 %arg, -16
-  %tmp2 = insertelement <6 x float> , float 4.000000e+00, i32 %tmp1
+  %tmp2 = insertelement <9 x float> , float 4.000000e+00, i32 %tmp1
   %tmp3 = add i32 %arg, -16
-  %tmp4 = insertelement <6 x float> , float -4.0, i32 %tmp3
-  %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
-  %tmp6 = extractelement <6 x i32> %tmp5, i32 1
-  %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
-  %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+  %tmp4 = insertelement <9 x float> , float -4.0, i32 %tmp3
+  %tmp5 = bitcast <9 x float> %tmp2 to <9 x i32>
+  %tmp6 = extractelement <9 x i32> %tmp5, i32 1
+  %tmp7 = bitcast <9 x float> %tmp4 to <9 x i32>
+  %tmp8 = extractelement <9 x i32> %tmp7, i32 5
   store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
   store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
   ret void
@@ -598,7 +621,7 @@
   ret void
 }

-; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
+; GCN-LABEL: {{^}}insertelement_v16f32_or_index:
 ; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
 ; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
 ; GCN-NOT: [[IDX_SHL]]
@@ -609,11 +632,11 @@
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %idx.in) nounwind {
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
-  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
-  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
+  %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %idx
+  store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   ret void
 }

@@ -648,9 +671,9 @@

 bb4:                                              ; preds = %bb2
   %vgpr = load volatile i32, i32 addrspace(1)* undef
-  %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
-  %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
-  %tmp7 = extractelement <8 x i32> %tmp6, i32 0
+  %tmp5 = insertelement <16 x i32> undef, i32 undef, i32 %vgpr
+  %tmp6 = insertelement <16 x i32> %tmp5, i32 %arg1, i32 %vgpr
+  %tmp7 = extractelement <16 x i32> %tmp6, i32 0
   br label %bb2

 bb8:                                              ; preds = %bb2
Index: test/CodeGen/AMDGPU/insert_vector_dynelt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -0,0 +1,296 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+; GCN-LABEL: {{^}}float4_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
+; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
+define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
+  store <4 x float> %v, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}int4_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]]
+; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
+define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <4 x i32> undef, i32 1, i32 %sel
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float2_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
+define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <2 x float> undef, float 1.000000e+00, i32 %sel
+  store <2 x float> %v, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float8_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
+; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
+; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
+define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <8 x float> undef, float 1.000000e+00, i32 %sel
+  store <8 x float> %v, <8 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}float16_inselt:
+; GCN: v_movreld_b32
+define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <16 x float> undef, float 1.000000e+00, i32 %sel
+  store <16 x float> %v, <16 x float> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half4_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
+define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <4 x half> undef, half 1.000000e+00, i32 %sel
+  store <4 x half> %v, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half2_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
+; GCN: s_and_b32 s{{[0-9]+}}, [[V]], 0x3c00
+define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <2 x half> undef, half 1.000000e+00, i32 %sel
+  store <2 x half> %v, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}half8_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <8 x half> undef, half 1.000000e+00, i32 %sel
+  store <8 x half> %v, <8 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}short2_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
+; GCN: s_and_b32 s{{[0-9]+}}, [[V]], 1
+define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <2 x i16> undef, i16 1, i32 %sel
+  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}short4_inselt:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
+; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <4 x i16> undef, i16 1, i32 %sel
+  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
+  ret void
+}
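The half and short cases above are the VecSize <= 64 && EltSize < 32 bucket that the combine deliberately skips: for packed vectors of two dwords or less, a lane can be updated with shifts and masks instead of per-element selects. A rough scalar model of that idea for one dword holding two 16-bit lanes (hypothetical helper; the checks above encode a shorter sequence, since these tests insert into undef):

#include <cstdint>

// Shift/mask lane update: compute the lane's bit offset (cf. the
// s_lshl_b32 of the index by 4 in the checks), build a 16-bit lane
// mask, and merge the new element into the packed dword.
uint32_t insertPackedLane16(uint32_t Packed, uint16_t Val, uint32_t Sel) {
  uint32_t Shift = (Sel & 1) * 16; // lane 0 -> bits 0..15, lane 1 -> 16..31
  uint32_t Mask = 0xFFFFu << Shift;
  return (Packed & ~Mask) | (uint32_t(Val) << Shift);
}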
+; GCN-LABEL: {{^}}byte8_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
+; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <8 x i8> undef, i8 1, i32 %sel
+  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}byte16_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
+; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_cndmask_b32_e32
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+; GCN-DAG: v_or_b32_sdwa
+define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <16 x i8> undef, i8 1, i32 %sel
+  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double2_inselt:
+; GCN-NOT: v_movrel
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <2 x double> undef, double 1.000000e+00, i32 %sel
+  store <2 x double> %v, <2 x double> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}double8_inselt:
+; GCN-NOT: v_cndmask
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <8 x double> undef, double 1.000000e+00, i32 %sel
+  store <8 x double> %v, <8 x double> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bit4_inselt:
+; GCN: buffer_store_byte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+; GCN: buffer_load_ubyte
+define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <4 x i1> undef, i1 1, i32 %sel
+  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bit128_inselt:
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
+; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
+define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, i32 %sel) {
+entry:
+  %v = insertelement <128 x i1> undef, i1 1, i32 %sel
+  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -83,8 +83,11 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
-; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
@@ -93,9 +96,14 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v3f32:
-; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
-; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: buffer_store_dwordx2 v
 ; GCN-DAG: buffer_store_dword v
 define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
@@ -104,8 +112,15 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f32:
-; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC4]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]], v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
@@ -114,7 +129,11 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f32:
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
+; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CCL]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
@@ -136,8 +155,11 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
-; GCN: v_movreld_b32
-; GCN: buffer_store_dwordx2
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5, v{{[0-9]+}}, [[CC1]]
+; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
@@ -145,8 +167,13 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i32:
-; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
-; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: buffer_store_dwordx2 v
 ; GCN-DAG: buffer_store_dword v
 define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
@@ -156,8 +183,15 @@

 ; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
 ; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
+; GCN-DAG: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC4]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC3]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[VVAL]], [[CC1]]
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
@@ -166,7 +200,10 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i32:
-; GCN: v_movreld_b32
+; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 7
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
@@ -288,24 +325,13 @@
 ; GCN: s_load_dwordx4
 ; GCN: s_load_dword s
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-; GCN: buffer_store_byte
-
-; GCN: buffer_store_byte
+; GCN-NOT: buffer_store_byte
+
+; GCN-DAG: v_cmp_ne_u32_e64 [[CCL:[^,]+]], [[IDX:s[0-9]+]], 15
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CCL]]
+; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}, [[CC1]]
+
 ; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
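The 64-bit element tests below (dynamic_insertelement_v2f64, _v2i64, _v3i64, _v4f64) show the same expansion at dword granularity: each element costs one compare whose result feeds two v_cndmask_b32, one per half. A small model of one such lane (illustrative C++, not the backend code):

#include <cstdint>

// One 64-bit lane of the select expansion: a single compare result
// (Take) drives two dword selects, matching the paired
// v_cndmask_b32_e32/_e64 per element in the checks below.
uint64_t select64AsDwords(bool Take, uint64_t Ins, uint64_t Elt) {
  uint32_t Lo = Take ? uint32_t(Ins) : uint32_t(Elt);             // low dword
  uint32_t Hi = Take ? uint32_t(Ins >> 32) : uint32_t(Elt >> 32); // high dword
  return (uint64_t(Hi) << 32) | Lo;
}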
@@ -343,23 +369,18 @@
 ; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
 ; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}

-; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
-
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}

 ; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000

-; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
-; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
-
-; Increment to next element folded into base register, but FileCheck
-; can't do math expressions
-
-; FIXME: Should be able to manipulate m0 directly instead of s_lshl_b32 + copy to m0
-
-; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[ELT1]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[ELT1]], [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
@@ -371,8 +392,12 @@

 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i64:
-; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 5
-; GCN-DAG: v_movreld_b32_e32 v{{[0-9]+}}, 0
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX:s[0-9]+]], 1
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
@@ -383,34 +408,41 @@
 }

 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX:s[0-9]+]], 2
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC3]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 5, [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
 define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
   store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
   ret void
 }

-; FIXME: Should be able to do without stack access. The used stack
-; space is also 2x what should be required.
-
 ; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
-; Stack store
-
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
-
-; Write element
-; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
-
-; Stack reload
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
+; GCN-DAG: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40200000
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC4:[^,]+]], [[IDX:s[0-9]+]], 3
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC4]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC4]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC3:[^,]+]], [[IDX]], 2
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC3]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC3]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 1
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC2]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
+; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX]], 0
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]], [[CC1]]
+; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]

-; Store result
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-; GCN: ScratchSize: 64
+; GCN: ScratchSize: 0
 define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
Index: test/CodeGen/AMDGPU/movreld-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/movreld-bug.ll
+++ test/CodeGen/AMDGPU/movreld-bug.ll
@@ -7,8 +7,8 @@
 ; GCN: ; return
 define amdgpu_ps float @main(i32 inreg %arg) #0 {
 main_body:
-  %tmp24 = insertelement <2 x float> undef, float 0.000000e+00, i32 %arg
-  %tmp25 = extractelement <2 x float> %tmp24, i32 1
+  %tmp24 = insertelement <16 x float> undef, float 0.000000e+00, i32 %arg
+  %tmp25 = extractelement <16 x float> %tmp24, i32 1
   ret float %tmp25
 }
Index: test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX8 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s

 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4

-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
@@ -36,11 +36,16 @@

 ; GCN-LABEL: {{^}}float4_alloca_load4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_load4

-; GFX-NOT: buffer_
-; GCN: v_readfirstlane_b32
-; GFX8: v_movreld_b32
-; GFX9: s_set_gpr_idx_on
-; GFX9: s_set_gpr_idx_off
+; GCN-NOT: buffer_
+; GCN-DAG: v_cmp_ne_u32_e32 [[CC1:[^,]+]], 3, [[IDX:v[0-9]+]]
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
+; GCN-DAG: v_cmp_ne_u32_e32 [[CC2:[^,]+]], 2, [[IDX]]
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
+; GCN-DAG: v_cmp_ne_u32_e32 [[CC3:[^,]+]], 1, [[IDX]]
+; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
+; GCN-DAG: v_cmp_ne_u32_e32 [[CC4:[^,]+]], 0, [[IDX]]
+; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
+; GCN: store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]

 ; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
 ; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
@@ -68,7 +73,7 @@

 ; GCN-LABEL: {{^}}half4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_store4

-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x44004200
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
@@ -98,7 +103,7 @@

 ; GCN-LABEL: {{^}}half4_alloca_load4:
 ; OPT-LABEL: define amdgpu_kernel void @half4_alloca_load4

-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
@@ -128,7 +133,7 @@

 ; GCN-LABEL: {{^}}short4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_store4

-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0x40003
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s{{\[}}[[SL]]:[[SH]]]
@@ -158,7 +163,7 @@

 ; GCN-LABEL: {{^}}short4_alloca_load4:
 ; OPT-LABEL: define amdgpu_kernel void @short4_alloca_load4

-; GFX-NOT: buffer_
+; GCN-NOT: buffer_
 ; GCN-DAG: s_mov_b32 s[[SH:[0-9]+]], 0
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0xffff
Index: test/CodeGen/AMDGPU/vector-extract-insert.ll
===================================================================
--- test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -27,7 +27,10 @@

 ; GCN-LABEL: {{^}}extract_insert_different_dynelt_v4i32:
 ; GCN: buffer_load_dwordx4
-; GCN: v_movreld_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32
 ; GCN: v_cndmask_b32