diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5695,12 +5695,13 @@
       return getConstant(0, DL, VT);
 
     // Skip unnecessary zext_inreg pattern:
-    // (zext (trunc (assertzext x))) -> (assertzext x)
-    // TODO: Generalize to MaskedValueIsZero check?
+    // (zext (trunc x)) -> x iff the upper bits are known zero.
+    // TODO: Generalize to just the MaskedValueIsZero check?
     if (OpOpcode == ISD::TRUNCATE) {
       SDValue OpOp = N1.getOperand(0);
       if (OpOp.getValueType() == VT) {
-        if (OpOp.getOpcode() == ISD::AssertZext) {
+        if (OpOp.getOpcode() == ISD::AssertZext ||
+            OpOp.getOpcode() == ISD::SRL) {
           APInt HiBits = APInt::getBitsSetFrom(VT.getScalarSizeInBits(),
                                                N1.getScalarValueSizeInBits());
           if (MaskedValueIsZero(OpOp, HiBits)) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -784,6 +784,7 @@
                        ISD::AND,
                        ISD::OR,
                        ISD::XOR,
+                       ISD::FSHR,
                        ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP,
                        ISD::FCANONICALIZE,
@@ -10768,6 +10769,30 @@
     return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
   }
 
+  case ISD::FSHR: {
+    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+    if (!ShiftOp || Op.getValueType().isVector())
+      return std::nullopt;
+
+    uint64_t BitsProvided = Op.getValueSizeInBits();
+    if (BitsProvided % 8 != 0)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
+    if (BitShift % 8)
+      return std::nullopt;
+
+    uint64_t ConcatSizeInBytes = BitsProvided / 4;
+    uint64_t ByteShift = BitShift / 8;
+
+    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
+    uint64_t BytesProvided = BitsProvided / 8;
+    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
+    NewIndex %= BytesProvided;
+    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
+  }
+
   case ISD::SRA:
   case ISD::SRL: {
     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -10998,6 +11023,95 @@
   return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
 }
 
+static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  if (VT != MVT::i32)
+    return SDValue();
+
+  // VT is known to be MVT::i32, so we need to provide 4 bytes.
+  SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+  for (int i = 0; i < 4; i++) {
+    // Find the ByteProvider that provides the ith byte of the result of OR
+    std::optional<ByteProvider<SDValue>> P =
+        calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
+    // TODO support constantZero
+    if (!P || P->isConstantZero())
+      return SDValue();
+
+    PermNodes.push_back(*P);
+  }
+  if (PermNodes.size() != 4)
+    return SDValue();
+
+  int FirstSrc = 0;
+  std::optional<int> SecondSrc;
+  uint64_t PermMask = 0x00000000;
+  for (size_t i = 0; i < PermNodes.size(); i++) {
+    auto PermOp = PermNodes[i];
+    // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
+    // by sizeof(Src2) = 4
+    int SrcByteAdjust = 4;
+
+    if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
+      if (SecondSrc.has_value())
+        if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+          return SDValue();
+
+      // Set the index of the second distinct Src node
+      SecondSrc = i;
+      assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
+      SrcByteAdjust = 0;
+    }
+    assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+    assert(!DAG.getDataLayout().isBigEndian());
+    PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+  }
+
+  SDValue Op = *PermNodes[FirstSrc].Src;
+  SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
+                                          : *PermNodes[FirstSrc].Src;
+
+  // Check that we haven't just recreated the same FSHR node.
+  if (N->getOpcode() == ISD::FSHR &&
+      (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+      (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+    return SDValue();
+
+  // Check that we are not just extracting the bytes in order from an op
+  if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
+    int Low16 = PermMask & 0xffff;
+    int Hi16 = (PermMask & 0xffff0000) >> 16;
+
+    bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
+    bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
+
+    // The perm op would really just produce Op. So combine into Op
+    if (WellFormedLow && WellFormedHi)
+      return DAG.getBitcast(MVT::getIntegerVT(32), Op);
+  }
+
+  if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
+    SDLoc DL(N);
+    assert(Op.getValueType().isByteSized() &&
+           OtherOp.getValueType().isByteSized());
+
+    // If the ultimate src is less than 32 bits, then we will only be
+    // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
+    // CalculateByteProvider would not have returned Op as source if we
+    // used a byte that is outside its ValueType. Thus, we are free to
+    // ANY_EXTEND as the extended bits are dont-cares.
+    Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
+    OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
+
+    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
+                       DAG.getConstant(PermMask, DL, MVT::i32));
+  }
+
+  return SDValue();
+}
+
 SDValue SITargetLowering::performOrCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -11111,80 +11225,8 @@
     }
   }
   if (LHSMask == ~0u || RHSMask == ~0u) {
-      SmallVector<ByteProvider<SDValue>, 8> PermNodes;
-
-      // VT is known to be MVT::i32, so we need to provide 4 bytes.
-      assert(VT == MVT::i32);
-      for (int i = 0; i < 4; i++) {
-        // Find the ByteProvider that provides the ith byte of the result of OR
-        std::optional<ByteProvider<SDValue>> P =
-            calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
-        // TODO support constantZero
-        if (!P || P->isConstantZero())
-          return SDValue();
-
-        PermNodes.push_back(*P);
-      }
-      if (PermNodes.size() != 4)
-        return SDValue();
-
-      int FirstSrc = 0;
-      std::optional<int> SecondSrc;
-      uint64_t PermMask = 0x00000000;
-      for (size_t i = 0; i < PermNodes.size(); i++) {
-        auto PermOp = PermNodes[i];
-        // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
-        // by sizeof(Src2) = 4
-        int SrcByteAdjust = 4;
-
-        if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
-          if (SecondSrc.has_value())
-            if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
-              return SDValue();
-
-          // Set the index of the second distinct Src node
-          SecondSrc = i;
-          assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
-          SrcByteAdjust = 0;
-        }
-        assert(PermOp.SrcOffset + SrcByteAdjust < 8);
-        assert(!DAG.getDataLayout().isBigEndian());
-        PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
-      }
-
-      SDValue Op = *PermNodes[FirstSrc].Src;
-      SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
-                                              : *PermNodes[FirstSrc].Src;
-
-      // Check that we are not just extracting the bytes in order from an op
-      if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
-        int Low16 = PermMask & 0xffff;
-        int Hi16 = (PermMask & 0xffff0000) >> 16;
-
-        bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
-        bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
-
-        // The perm op would really just produce Op. So combine into Op
-        if (WellFormedLow && WellFormedHi)
-          return DAG.getBitcast(MVT::getIntegerVT(32), Op);
-      }
-
-      if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
-        SDLoc DL(N);
-        assert(Op.getValueType().isByteSized() &&
-               OtherOp.getValueType().isByteSized());
-
-        // If the ultimate src is less than 32 bits, then we will only be
-        // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
-        // CalculateByteProvider would not have returned Op as source if we
-        // used a byte that is outside its ValueType. Thus, we are free to
-        // ANY_EXTEND as the extended bits are dont-cares.
-        Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
-        OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
-
-        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
-                           DAG.getConstant(PermMask, DL, MVT::i32));
-      }
+    if (SDValue Perm = matchPERM(N, DCI))
+      return Perm;
   }
 }
 
@@ -13045,6 +13087,14 @@
     return performAndCombine(N, DCI);
   case ISD::OR:
     return performOrCombine(N, DCI);
+  case ISD::FSHR: {
+    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
+        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
+      return matchPERM(N, DCI);
+    }
+    break;
+  }
   case ISD::XOR:
     return performXorCombine(N, DCI);
   case ISD::ZERO_EXTEND:
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -1523,9 +1523,8 @@
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
-; VI-NEXT:    v_cmp_ne_u16_e64 s[6:7], s5, 0
-; VI-NEXT:    s_and_b64 vcc, exec, s[6:7]
-; VI-NEXT:    s_cbranch_vccz .LBB14_4
+; VI-NEXT:    s_cmp_lg_u32 s5, 0
+; VI-NEXT:    s_cbranch_scc0 .LBB14_4
 ; VI-NEXT:  ; %bb.1: ; %else
 ; VI-NEXT:    s_mov_b32 s11, 0xf000
 ; VI-NEXT:    s_mov_b32 s10, -1
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -353,7 +353,7 @@
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_perm_b32 v0, v0, v0, 0x5040706
+; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX10-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -361,9 +361,8 @@
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
-; GFX9-NEXT:    s_mov_b32 s4, 0x5040706
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_perm_b32 v0, v0, v0, s4
+; GFX9-NEXT:    v_alignbit_b32 v0, v0, v0, 16
 ; GFX9-NEXT:    global_store_dword v[2:3], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -1234,13 +1233,12 @@
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v10, 16
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v[5:6], v1, off
-; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v1, v10, v9, 0x2000706
+; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    global_store_dword v[7:8], v1, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ive_store_div:
@@ -1256,18 +1254,18 @@
 ; GFX9-NEXT:    global_load_dword v10, v[2:3], off
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT:    s_mov_b32 s5, 0x2000706
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_alignbit_b32 v2, v1, v10, 16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v3, v10, v9, s5
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v2, off
+; GFX9-NEXT:    global_store_dword v[7:8], v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
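
For reference, here is a minimal standalone sketch (not part of the patch; the helpers `fshr32` and `byteViaRemap` are hypothetical names) of the byte remapping the new `ISD::FSHR` case in `calculateByteProvider` performs: for a byte-aligned shift amount, byte `Index` of `fshr(X, Y, Z)` is byte `(Index + Z/8) % 8` of the 8-byte concatenation `X:Y`, where concat bytes 0-3 come from `Y` (operand 1) and bytes 4-7 from `X` (operand 0).

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

// Reference fshr on i32: the low 32 bits of the 64-bit concat X:Y
// shifted right by Z % 32.
static uint32_t fshr32(uint32_t X, uint32_t Y, unsigned Z) {
  Z %= 32;
  return Z == 0 ? Y : (X << (32 - Z)) | (Y >> Z);
}

// Mirrors the index arithmetic of the ISD::FSHR case above.
static uint8_t byteViaRemap(uint32_t X, uint32_t Y, unsigned Z,
                            unsigned Index) {
  const unsigned BitsProvided = 32;
  unsigned BitShift = Z % BitsProvided;
  assert(BitShift % 8 == 0 && "the patch returns std::nullopt otherwise");
  unsigned ConcatSizeInBytes = BitsProvided / 4; // 8 bytes in X:Y
  unsigned BytesProvided = BitsProvided / 8;     // 4 bytes per operand
  unsigned NewIndex = (Index + BitShift / 8) % ConcatSizeInBytes;
  // Concat bytes 0..3 are Y (operand 1); bytes 4..7 are X (operand 0).
  uint32_t Src = NewIndex >= BytesProvided ? X : Y;
  NewIndex %= BytesProvided;
  return (Src >> (8 * NewIndex)) & 0xff;
}

int main() {
  uint32_t X = 0x44332211, Y = 0x88776655;
  for (unsigned Z = 0; Z < 32; Z += 8)
    for (unsigned I = 0; I < 4; I++)
      assert(byteViaRemap(X, Y, Z, I) ==
             ((fshr32(X, Y, Z) >> (8 * I)) & 0xff));
  puts("FSHR byte remapping agrees with the reference");
}
```

Non-byte-aligned shift amounts (and vector types) bail out with `std::nullopt`, since `v_perm_b32` can only route whole bytes.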
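
Similarly, a sketch of the byte selection that `matchPERM` encodes in `PermMask`, assuming `v_perm_b32`-style semantics: selector byte values 0-3 pick bytes of the second source and 4-7 pick bytes of the first, which is why `SrcByteAdjust` is 4 for `FirstSrc` and 0 for `SecondSrc`. `permBytes` is an illustrative helper, not LLVM or AMDGPU API.

```cpp
#include <cstdint>
#include <cstdio>

// Applies PermMask the way matchPERM's comments describe it: the I-th
// selector byte picks byte Sel % 4 of OtherOp when Sel < 4, else byte
// Sel - 4 of Op.
static uint32_t permBytes(uint32_t Op, uint32_t OtherOp, uint32_t PermMask) {
  uint32_t Result = 0;
  for (unsigned I = 0; I < 4; ++I) {
    unsigned Sel = (PermMask >> (8 * I)) & 0xff;
    uint32_t Src = Sel >= 4 ? Op : OtherOp;
    Result |= uint32_t((Src >> (8 * (Sel % 4))) & 0xff) << (8 * I);
  }
  return Result;
}

int main() {
  uint32_t V = 0xddccbbaa;
  // A "well-formed" mask (halves 0x0706 and 0x0504) reproduces Op when both
  // sources match, which is why matchPERM folds it to Op instead of a perm.
  printf("0x%08x\n", permBytes(V, V, 0x07060504)); // 0xddccbbaa
  // fshr(V, V, 16), i.e. a 16-bit rotate: bytes 2,3,0,1 of V.
  printf("0x%08x\n", permBytes(V, V, 0x01000706)); // 0xbbaaddcc
}
```

This is also why the updated permute_i8.ll checks lower a single-source 16-bit rotate to `v_alignbit_b32 v0, v0, v0, 16` rather than a `v_perm_b32`: the rotate needs no byte-level routing across two sources.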