diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1423,32 +1423,42 @@
 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
-
+  SDLoc SL(Op);
   SmallVector<SDValue, 8> Args;
   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   EVT VT = Op.getValueType();
   EVT SrcVT = Op.getOperand(0).getValueType();
 
-  // For these types, we have some TableGen patterns except if the index is 1
-  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
-       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
-      Start != 1)
-    return Op;
+  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
+    unsigned NumElt = VT.getVectorNumElements();
+    unsigned NumSrcElt = SrcVT.getVectorNumElements();
+    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
 
-  if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
-       (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
-      (Start == 0 || Start == 4))
-    return Op;
+    // We have some TableGen patterns for when the extracted vector is exactly
+    // the low or high half of the operand.
+    if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
+      return Op;
 
-  if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
-       (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
-      (Start == 0 || Start == 8))
-    return Op;
+    // Extract 32-bit registers at a time.
+    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
+    EVT NewVT = NumElt == 2
+                    ? MVT::i32
+                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
+    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
+
+    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
+    if (NumElt == 2)
+      Tmp = Args[0];
+    else
+      Tmp = DAG.getBuildVector(NewVT, SL, Args);
+
+    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
+  }
 
   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                             VT.getVectorNumElements());
 
-  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+  return DAG.getBuildVector(Op.getValueType(), SL, Args);
 }
 
 // TODO: Handle fabs too
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5762,6 +5762,35 @@
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   SDLoc SL(Op);
 
+  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
+    // Insert 32-bit registers at a time.
+    assert(InsNumElts % 2 == 0 && "expect legal vector types");
+
+    unsigned VecNumElts = VecVT.getVectorNumElements();
+    EVT NewVecVT =
+        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
+    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
+                                   : EVT::getVectorVT(*DAG.getContext(),
+                                                      MVT::i32, InsNumElts / 2);
+
+    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
+    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
+
+    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
+      SDValue Elt;
+      if (InsNumElts == 2) {
+        Elt = Ins;
+      } else {
+        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
+                          DAG.getConstant(I, SL, MVT::i32));
+      }
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
+                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
+    }
+
+    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
+  }
+
   for (unsigned I = 0; I != InsNumElts; ++I) {
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                               DAG.getConstant(I, SL, MVT::i32));
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -947,16 +947,7 @@
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    v_bfi_b32 v4, s4, v0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v5, s4, v2, v2
-; GFX9-NEXT:    v_bfi_b32 v4, s4, v0, v4
-; GFX9-NEXT:    v_bfi_b32 v4, s4, v0, v4
-; GFX9-NEXT:    v_bfi_b32 v5, s4, v2, v5
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v5
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %idx = shl i32 %idxp, 4
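
Note (not part of the patch): both the EXTRACT_SUBVECTOR and INSERT_SUBVECTOR hunks rely on the same observation. A legal vector of 16-bit elements can be bitcast to a vector of half as many 32-bit registers, so a sub-vector operation that starts at an even element index and covers an even number of elements maps onto whole 32-bit lanes. The standalone C++ sketch below only models that index arithmetic on plain arrays; the helper name extractSubvector16 and the std::vector/memcpy plumbing are illustrative assumptions and are not LLVM or SelectionDAG code.

```cpp
// Standalone sketch of the "extract 32-bit registers at a time" idea.
// A vector of 16-bit elements is reinterpreted as half as many 32-bit words;
// an extraction starting at an even element index becomes a word-aligned copy
// of words Start/2 .. Start/2 + NumElt/2.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Extract NumElt 16-bit elements starting at Start, working on 32-bit words.
std::vector<uint16_t> extractSubvector16(const std::vector<uint16_t> &Src,
                                         unsigned Start, unsigned NumElt) {
  assert(Start % 2 == 0 && NumElt % 2 == 0 && Src.size() % 2 == 0 &&
         "even, 32-bit-aligned extraction only");

  // "Bitcast" the source to 32-bit words.
  std::vector<uint32_t> Words(Src.size() / 2);
  std::memcpy(Words.data(), Src.data(), Src.size() * sizeof(uint16_t));

  // Copy NumElt / 2 words starting at word index Start / 2.
  std::vector<uint32_t> SubWords(Words.begin() + Start / 2,
                                 Words.begin() + Start / 2 + NumElt / 2);

  // "Bitcast" back to 16-bit elements.
  std::vector<uint16_t> Result(NumElt);
  std::memcpy(Result.data(), SubWords.data(), NumElt * sizeof(uint16_t));
  return Result;
}

int main() {
  std::vector<uint16_t> V8 = {0, 1, 2, 3, 4, 5, 6, 7};
  auto Hi = extractSubvector16(V8, 4, 4); // upper half: {4, 5, 6, 7}
  assert(Hi[0] == 4 && Hi[3] == 7);
  return 0;
}
```

Working on whole 32-bit lanes is what lets the GFX9 check lines drop the v_bfi_b32 sequence in extract-subvector-16bit.ll: no per-16-bit-element merging is needed once the extraction is expressed as 32-bit register moves.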