diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1423,32 +1423,42 @@
 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
-
+  SDLoc SL(Op);
   SmallVector<SDValue, 8> Args;
   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   EVT VT = Op.getValueType();
   EVT SrcVT = Op.getOperand(0).getValueType();
 
-  // For these types, we have some TableGen patterns except if the index is 1
-  if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
-       (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
-      Start != 1)
-    return Op;
+  if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
+    unsigned NumElt = VT.getVectorNumElements();
+    unsigned NumSrcElt = SrcVT.getVectorNumElements();
+    assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
 
-  if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
-       (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
-      (Start == 0 || Start == 4))
-    return Op;
+    // We have some TableGen patterns for when the extracted vector is exactly
+    // the low or high half of the operand.
+    if ((NumSrcElt == 2 * NumElt) && (Start == 0 || Start == NumElt))
+      return Op;
 
-  if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
-       (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
-      (Start == 0 || Start == 8))
-    return Op;
+    // Extract 32-bit registers at a time.
+    EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
+    EVT NewVT = NumElt == 2
+                    ? MVT::i32
+                    : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
+    SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
+
+    DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
+    if (NumElt == 2)
+      Tmp = Args[0];
+    else
+      Tmp = DAG.getBuildVector(NewVT, SL, Args);
+
+    return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
+  }
 
   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
                             VT.getVectorNumElements());
 
-  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+  return DAG.getBuildVector(Op.getValueType(), SL, Args);
 }
 
 // TODO: Handle fabs too
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5762,6 +5762,35 @@
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   SDLoc SL(Op);
 
+  if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
+    // Insert 32-bit registers at a time.
+    assert(InsNumElts % 2 == 0 && "expect legal vector types");
+
+    unsigned VecNumElts = VecVT.getVectorNumElements();
+    EVT NewVecVT =
+        EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
+    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
+                                   : EVT::getVectorVT(*DAG.getContext(),
+                                                      MVT::i32, InsNumElts / 2);
+
+    Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
+    Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
+
+    for (unsigned I = 0; I != InsNumElts / 2; ++I) {
+      SDValue Elt;
+      if (InsNumElts == 2) {
+        Elt = Ins;
+      } else {
+        Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
+                          DAG.getConstant(I, SL, MVT::i32));
+      }
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
+                        DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
+    }
+
+    return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
+  }
+
   for (unsigned I = 0; I != InsNumElts; ++I) {
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
                               DAG.getConstant(I, SL, MVT::i32));
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -213,8 +213,6 @@
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:  .LBB4_3: ; %if.end
-; GCN-NEXT:    s_mov_b32 s4, 0xffff
-; GCN-NEXT:    v_bfi_b32 v0, s4, v0, v0
 ; GCN-NEXT:    global_store_short v[0:1], v1, off
 ; GCN-NEXT:    global_store_dword v[0:1], v0, off
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -947,16 +947,7 @@
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
 ; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX9-NEXT:    v_bfi_b32 v4, s4, v0, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_bfi_b32 v5, s4, v2, v2
-; GFX9-NEXT:    v_bfi_b32 v4, s4, v0, v4
-; GFX9-NEXT:    v_bfi_b32 v4, s4, v0, v4
-; GFX9-NEXT:    v_bfi_b32 v5, s4, v2, v5
-; GFX9-NEXT:    v_bfi_b32 v2, s4, v2, v5
-; GFX9-NEXT:    v_bfi_b32 v0, s4, v0, v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %idx = shl i32 %idxp, 4
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -10,13 +10,12 @@
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-NEXT:    s_pack_lh_b32_b16 s4, s0, s0
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX900-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX900-NEXT:    v_mov_b32_e32 v1, s1
+; GFX900-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX900-NEXT:    v_mov_b32_e32 v3, s0
-; GFX900-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX900-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX900-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX900-NEXT:    s_endpgm
@@ -26,13 +25,12 @@
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX906-NEXT:    s_pack_lh_b32_b16 s4, s0, s0
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX906-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX906-NEXT:    v_mov_b32_e32 v1, s1
+; GFX906-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX906-NEXT:    v_mov_b32_e32 v3, s0
-; GFX906-NEXT:    v_mov_b32_e32 v0, s4
-; GFX906-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX906-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX906-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX906-NEXT:    s_endpgm
@@ -42,13 +40,12 @@
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    s_pack_lh_b32_b16 s4, s0, s0
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX908-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX908-NEXT:    v_mov_b32_e32 v1, s1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX908-NEXT:    v_mov_b32_e32 v3, s0
-; GFX908-NEXT:    v_mov_b32_e32 v0, s4
-; GFX908-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX908-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX908-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX908-NEXT:    s_endpgm
@@ -58,13 +55,12 @@
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_pack_lh_b32_b16 s4, s0, s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
 ; GFX90A-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX90A-NEXT:    s_endpgm
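
Note (not part of the patch): the v_bfi_b32/0xffff sequences disappear from the tests above because a 16-bit subvector that starts at an even lane occupies whole 32-bit registers, so it can be moved with plain 32-bit copies instead of per-lane bitfield inserts. Below is a minimal host-side C++ sketch of that equivalence for the extract direction (the insert direction is symmetric); all names here are illustrative only and are not part of the LLVM change.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

using V4I16 = std::array<uint16_t, 4>;
using V2I16 = std::array<uint16_t, 2>;

// Element-wise reference: take lanes [Start, Start+2) of a v4i16,
// analogous to what the generic per-element lowering produces.
static V2I16 extractLanes(const V4I16 &Vec, unsigned Start) {
  return {Vec[Start], Vec[Start + 1]};
}

// 32-bit path the new lowering prefers: view the v4i16 as v2i32 and
// move one whole word; valid precisely because Start is even.
static V2I16 extractWord(const V4I16 &Vec, unsigned Start) {
  assert(Start % 2 == 0 && "32-bit path needs an even start lane");
  uint32_t Words[2];
  std::memcpy(Words, Vec.data(), sizeof(Words)); // bitcast v4i16 -> v2i32
  V2I16 Out;
  std::memcpy(Out.data(), &Words[Start / 2], sizeof(Out)); // bitcast back
  return Out;
}

int main() {
  const V4I16 Vec = {0x1111, 0x2222, 0x3333, 0x4444};
  for (unsigned Start : {0u, 2u})
    assert(extractLanes(Vec, Start) == extractWord(Vec, Start));
  return 0;
}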