diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -562,6 +562,7 @@
     SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue convertBuildVecZextToZext(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
+    SDValue reduceBuildVecTruncToBitCast(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
@@ -17475,6 +17476,92 @@
   return DAG.getBitcast(VT, BV);
 }
 
+/// Fold a BUILD_VECTOR whose elements are consecutive slices of one wide value
+/// into a single bitcast of that value:
+///
+///   (build_vec (trunc $1)
+///              (trunc (srl $1 half-width))
+///              (trunc (srl $1 (2 * half-width))) ...)
+///     -> (bitcast $1)
+///
+/// Element i must be undef or the truncation of $1 shifted right by exactly
+/// i * element-width bits; bitcasts around the truncate and around $1 are
+/// looked through. Returns an empty SDValue when the pattern does not match.
+SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+  // Only handled for little-endian data layouts: element i is taken from bit
+  // offset i * element-width of the source.
+  if (!DAG.getDataLayout().isLittleEndian())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  uint64_t ScalarTypeBitsize = VT.getScalarType().getSizeInBits();
+
+  // Only for power of two types to be sure that bitcast works well
+  if (!isPowerOf2_64(ScalarTypeBitsize))
+    return SDValue();
+
+  // Look through a single bitcast.
+  auto PeekThroughBitcast = [](SDValue Op) {
+    if (Op.getOpcode() == ISD::BITCAST)
+      return Op.getOperand(0);
+    return Op;
+  };
+
+  // The source value where all the parts are extracted.
+  SDValue Src;
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDValue In = PeekThroughBitcast(N->getOperand(i));
+    // Ignore undef inputs.
+    if (In.isUndef())
+      continue;
+
+    // A vector truncate (reachable through the bitcast above) narrows every
+    // lane and is not a contiguous slice of its operand.
+    if (In.getOpcode() != ISD::TRUNCATE || In.getValueType().isVector())
+      return SDValue();
+
+    In = PeekThroughBitcast(In.getOperand(0));
+
+    if (In.getOpcode() != ISD::SRL) {
+      // For now only build_vec without shuffling, handle shifts here in the
+      // future.
+      if (i != 0)
+        return SDValue();
+
+      Src = In;
+      continue;
+    }
+
+    // In is SRL: its shifted operand must be the common source.
+    SDValue Part = PeekThroughBitcast(In.getOperand(0));
+    if (!Src)
+      Src = Part;
+    else if (Src != Part)
+      return SDValue(); // Vector parts do not stem from the same variable.
+
+    if (!isa<ConstantSDNode>(In.getOperand(1)))
+      return SDValue();
+
+    // The extracted value is not extracted at the right position
+    if (In.getNode()->getConstantOperandVal(1) != i * ScalarTypeBitsize)
+      return SDValue();
+  }
+
+  // Every operand was undef (possibly behind a bitcast): nothing to cast.
+  if (!Src)
+    return SDValue();
+
+  // Only cast if the size is the same
+  if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
+    return SDValue();
+
+  // NOTE(review): no legality query is made for this BITCAST; confirm that
+  // is acceptable at every combine level where this runs.
+  return DAG.getBitcast(VT, Src);
+}
+
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
@@ -18006,6 +18093,9 @@
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
+  if (SDValue V = reduceBuildVecTruncToBitCast(N))
+    return V;
+
   if (SDValue V = reduceBuildVecToShuffle(N))
     return V;
 
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -230,10 +230,9 @@
 ; SI-NEXT: s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s0, s2, 0xff
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
@@ -244,7 +243,6 @@
 ; VI-NEXT: s_mov_b32 s7, 0xf000
 ; VI-NEXT: s_mov_b32 s6, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xff
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -47,12 +47,9 @@ ; GFX9-LABEL: shuffle_v4f16_u3u1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -66,8 +63,6 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -95,13 +90,11 @@ ; GFX9-LABEL: shuffle_v4f16_3uu7: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[2:3], off offset:4 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v1, v[2:3], off offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -117,11 +110,11 @@ ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1