diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5733,6 +5733,43 @@
   return DAG.getUNDEF(ASC->getValueType(0));
 }
 
+static SDValue InsertVecElt(SDValue Vec, SDValue InsVal, SDValue Idx,
+                            const SDLoc &SL, SelectionDAG &DAG) {
+  EVT VecVT = Vec.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
+  unsigned EltSize = EltVT.getSizeInBits();
+
+  assert(VecSize <= 64);
+
+  MVT IntVT = MVT::getIntegerVT(VecSize);
+
+  // Avoid stack access for dynamic indexing.
+  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
+
+  // Create a congruent vector with the target value in each element so that
+  // the required element can be masked and ORed into the target vector.
+  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
+                               DAG.getSplatBuildVector(VecVT, SL, InsVal));
+
+  assert(isPowerOf2_32(EltSize));
+  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
+
+  // Convert vector index to bit-index.
+  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
+
+  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
+  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
+                            DAG.getConstant(0xffff, SL, IntVT), ScaledIdx);
+
+  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
+  SDValue RHS =
+      DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
+
+  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
+  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
+}
+
 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
 // the small vector and inserting them into the big vector. That is better than
 // the default expansion of doing it via a stack slot. Even though the use of
@@ -5746,6 +5783,7 @@
   EVT VecVT = Vec.getValueType();
   EVT InsVT = Ins.getValueType();
   EVT EltVT = VecVT.getVectorElementType();
+  unsigned VecSize = VecVT.getSizeInBits();
   unsigned InsNumElts = InsVT.getVectorNumElements();
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   SDLoc SL(Op);
@@ -5753,9 +5791,18 @@
   for (unsigned I = 0; I != InsNumElts; ++I) {
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                               DAG.getConstant(I, SL, MVT::i32));
-    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
-                      DAG.getConstant(IdxVal + I, SL, MVT::i32));
+    SDValue CurIdx = DAG.getConstant(IdxVal + I, SL, MVT::i32);
+    if (VecSize <= 64) {
+      // Target vectors of 64 bits or less can be lowered directly instead of
+      // taking them through INSERT_VECTOR_ELT again.
+      Vec = InsertVecElt(Vec, Elt, CurIdx, SL, DAG);
+    } else {
+      // TODO: Wider vectors are not custom lowered here at the moment; custom
+      // lower all possible vector sizes in a generic way.
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, CurIdx);
+    }
   }
+
   return Vec;
 }
 
@@ -5769,7 +5816,6 @@
   unsigned VecSize = VecVT.getSizeInBits();
   unsigned EltSize = EltVT.getSizeInBits();
 
-  assert(VecSize <= 64);
 
   unsigned NumElts = VecVT.getVectorNumElements();
 
@@ -5803,36 +5849,12 @@
     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
   }
 
+  // Static indexing does not lower to stack access, so keep letting the
+  // generic expansion handle it; only dynamic indexing needs the helper.
   if (isa<ConstantSDNode>(Idx))
     return SDValue();
 
-  MVT IntVT = MVT::getIntegerVT(VecSize);
-
-  // Avoid stack access for dynamic indexing.
-  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
-
-  // Create a congruent vector with the target value in each element so that
-  // the required element can be masked and ORed into the target vector.
-  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
-                               DAG.getSplatBuildVector(VecVT, SL, InsVal));
-
-  assert(isPowerOf2_32(EltSize));
-  SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
-
-  // Convert vector index to bit-index.
-  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
-
-  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
-  SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
-                            DAG.getConstant(0xffff, SL, IntVT),
-                            ScaledIdx);
-
-  SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
-  SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
-                            DAG.getNOT(SL, BFM, IntVT), BCVec);
-
-  SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
-  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
+  return InsertVecElt(Vec, InsVal, Idx, SL, DAG);
 }
 
 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -1295,12 +1295,14 @@
 define amdgpu_kernel void @fma_shuffle(<4 x half> addrspace(1)* nocapture readonly %A, <4 x half> addrspace(1)* nocapture readonly %B, <4 x half> addrspace(1)* nocapture %C) {
 ; GFX9-LABEL: fma_shuffle:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT:    s_add_u32 s0, s0, s7
+; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX9-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
 ; GFX9-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
@@ -1312,14 +1314,16 @@
 ;
 ; GFX10-LABEL: fma_shuffle:
 ; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_add_u32 s0, s0, s7
 ; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
 ; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_clause 0x2
-; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX10-NEXT:    global_load_dwordx2 v[2:3], v6, s[10:11]
 ; GFX10-NEXT:    global_load_dwordx2 v[4:5], v6, s[6:7]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1]
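
Note for reviewers: the DAG nodes built by InsertVecElt are the standard
mask-and-merge form of a bitfield insert. A minimal scalar sketch of the same
computation, assuming a <4 x i16> vector packed into a uint64_t (the function
name and the packing are illustrative only, not part of the patch):

    #include <cstdint>

    // Insert the 16-bit value Val into lane Idx of a <4 x i16> vector held
    // in a 64-bit integer, mirroring the ExtVal/BFM/LHS/RHS nodes above.
    uint64_t insertElt(uint64_t Vec, uint16_t Val, unsigned Idx) {
      unsigned ScaledIdx = Idx * 16;                // vector index -> bit index
      uint64_t Splat = 0x0001000100010001ULL * Val; // splat Val into every lane
      uint64_t BFM = 0xffffULL << ScaledIdx;        // mask selecting the lane
      return (BFM & Splat) | (~BFM & Vec);          // v_bfi_b32-style merge
    }

For example, insertElt(Vec, Val, 2) rewrites bits [47:32] and leaves the other
three lanes untouched, which is what the v_bfi_b32 (v_bfm_b32 ...) pattern in
the code comment computes for the 32-bit case.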