diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -789,6 +789,7 @@
                        ISD::AND,
                        ISD::OR,
                        ISD::XOR,
+                       ISD::FSHR,
                        ISD::SINT_TO_FP,
                        ISD::UINT_TO_FP,
                        ISD::FCANONICALIZE,
@@ -10773,6 +10774,30 @@
     return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
   }
 
+  case ISD::FSHR: {
+    // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+    if (!ShiftOp || Op.getValueType().isVector())
+      return std::nullopt;
+
+    uint64_t BitsProvided = Op.getValueSizeInBits();
+    if (BitsProvided % 8 != 0)
+      return std::nullopt;
+
+    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
+    if (BitShift % 8)
+      return std::nullopt;
+
+    uint64_t ConcatSizeInBytes = BitsProvided / 4;
+    uint64_t ByteShift = BitShift / 8;
+
+    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
+    uint64_t BytesProvided = BitsProvided / 8;
+    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
+    NewIndex %= BytesProvided;
+    return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
+  }
+
   case ISD::SRA:
   case ISD::SRL: {
     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -11053,6 +11078,12 @@
     SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
                                             : *PermNodes[FirstSrc].Src;
 
+    // Check that we haven't just recreated the same FSHR node.
+    if (N->getOpcode() == ISD::FSHR &&
+        (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+        (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+      return SDValue();
+
     // Check that we are not just extracting the bytes in order from an op
     if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
       int Low16 = PermMask & 0xffff;
@@ -13061,6 +13092,14 @@
     return performAndCombine(N, DCI);
   case ISD::OR:
     return performOrCombine(N, DCI);
+  case ISD::FSHR: {
+    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
+        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
+      return matchPERM(N, DCI);
+    }
+    break;
+  }
  case ISD::XOR:
    return performXorCombine(N, DCI);
  case ISD::ZERO_EXTEND:
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -1234,13 +1234,12 @@
 ; GFX10-NEXT:    v_lshrrev_b32_sdwa v0, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_and_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX10-NEXT:    v_alignbit_b32 v0, v0, v10, 16
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    global_store_dword v[5:6], v1, off
-; GFX10-NEXT:    global_store_dword v[7:8], v0, off
+; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v1, v10, v9, 0x2000706
+; GFX10-NEXT:    global_store_dword v[5:6], v0, off
+; GFX10-NEXT:    global_store_dword v[7:8], v1, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: ive_store_div:
@@ -1256,18 +1255,18 @@
 ; GFX9-NEXT:    global_load_dword v10, v[2:3], off
 ; GFX9-NEXT:    s_movk_i32 s4, 0xff
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v4
+; GFX9-NEXT:    s_mov_b32 s5, 0x2000706
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_and_b32_sdwa v2, v10, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
-; GFX9-NEXT:    v_alignbit_b32 v2, v1, v10, 16
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_perm_b32 v3, v10, v9, s5
 ; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v2, off
+; GFX9-NEXT:    global_store_dword v[7:8], v3, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
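
Note: below is a minimal standalone sketch (not part of the patch) of the byte-index mapping that the new ISD::FSHR case of calculateByteProvider performs, assuming a 32-bit fshr with a constant, byte-aligned shift amount. The helper name fshrByteSource is made up for illustration; it is not an LLVM API.

    #include <cstdint>
    #include <utility>

    // For fshr(X, Y, Z) on i32, result byte `Index` (0 = least significant) is
    // byte (Index + Z/8) of the 64-bit concatenation X:Y, where Y occupies the
    // low 4 bytes and X the high 4 bytes. Returns {operand, byte within operand}
    // with operand 0 = X and operand 1 = Y, mirroring Op.getOperand(0/1).
    std::pair<int, unsigned> fshrByteSource(unsigned Index, uint64_t Z) {
      const uint64_t BitsProvided = 32;                    // width of each operand
      const uint64_t BytesProvided = BitsProvided / 8;     // 4 bytes per operand
      const uint64_t ConcatSizeInBytes = BitsProvided / 4; // X:Y is twice as wide
      uint64_t BitShift = Z % BitsProvided;
      uint64_t ByteShift = BitShift / 8;                   // assumes BitShift % 8 == 0
      uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
      int Operand = NewIndex >= BytesProvided ? 0 : 1;     // high bytes come from X
      return {Operand, static_cast<unsigned>(NewIndex % BytesProvided)};
    }

For example, fshrByteSource(3, 8) returns {0, 0}: with an 8-bit shift amount the top result byte is byte 0 of X. This per-byte source information is what lets matchPERM fold the surrounding byte shuffle into a single v_perm_b32.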