diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10437,6 +10437,25 @@ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + SDValue NarrowOp = Op->getOperand(0); + auto NarrowVT = NarrowOp.getValueType(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowVT = VTSign->getVT(); + } + if (!NarrowVT.isByteSized()) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); + + if (SrcIndex >= NarrowByteWidth) + return std::nullopt; + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10476,7 +10495,8 @@ unsigned BitWidth = Op.getScalarValueSizeInBits(); if (BitWidth % 8 != 0) return std::nullopt; - assert(Index < BitWidth / 8 && "invalid index requested"); + if (Index >= BitWidth / 8) + return std::nullopt; switch (Op.getOpcode()) { case ISD::OR: { @@ -10519,6 +10539,7 @@ return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); } + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10565,9 +10586,18 @@ } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: + case ISD::AssertZext: + case ISD::AssertSext: { SDValue NarrowOp = Op->getOperand(0); - unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || + Op->getOpcode() == ISD::AssertZext || + Op->getOpcode() == ISD::AssertSext) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowBitWidth = VTSign->getVT().getSizeInBits(); + } if 
(NarrowBitWidth % 8 != 0) return std::nullopt; uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -10581,10 +10611,7 @@ } case ISD::TRUNCATE: { - unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); - if (NarrowBitWidth % 8 != 0) - return std::nullopt; - uint64_t NarrowByteWidth = NarrowBitWidth / 8; + uint64_t NarrowByteWidth = BitWidth / 8; if (NarrowByteWidth >= Index) { return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, @@ -10632,6 +10659,24 @@ case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, Depth + 1, StartingIndex); + + case ISD::EXTRACT_VECTOR_ELT: { + // A non-constant index cannot be mapped to a fixed source byte. + auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!IdxOp) + return std::nullopt; + auto VecIdx = IdxOp->getZExtValue(); + auto ScalarSize = Op.getScalarValueSizeInBits(); + if (ScalarSize != 32) { + if ((VecIdx + 1) * ScalarSize > 32) + return std::nullopt; + Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; + } + + return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0), + StartingIndex, Index); + } + default: { return std::nullopt; } @@ -10833,9 +10878,8 @@ std::optional<ByteProvider<SDValue>> P = calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); // TODO support constantZero - if (!P || P->isConstantZero()) { + if (!P || P->isConstantZero()) return SDValue(); - } PermNodes.push_back(*P); } @@ -10871,7 +10915,7 @@ : *PermNodes[FirstSrc].Src; // Check that we are not just extracting the bytes in order from an op - if (Op == OtherOp) { + if (Op == OtherOp && Op.getValueSizeInBits() == 32) { int Low16 = PermMask & 0xffff; int Hi16 = (PermMask & 0xffff0000) >> 16; @@ -10880,13 +10924,24 @@ // The perm op would really just produce Op. So combine into Op if (WellFormedLow && WellFormedHi) - return Op; + return Op.getValueType().isVector() + ? 
DAG.getBitcast(MVT::getIntegerVT(32), Op) + : Op; } if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { SDLoc DL(N); assert(Op.getValueType().isByteSized() && OtherOp.getValueType().isByteSized()); + + if (Op.getValueType().isVector()) { + Op = DAG.getBitcast(MVT::getIntegerVT(Op.getValueSizeInBits()), Op); + } + if (OtherOp.getValueType().isVector()) { + OtherOp = DAG.getBitcast( + MVT::getIntegerVT(OtherOp.getValueSizeInBits()), OtherOp); + } + if (Op.getValueSizeInBits() < 32) // If the ultimate src is less than 32 bits, then we will only be // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. @@ -10896,7 +10951,7 @@ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op); if (OtherOp.getValueSizeInBits() < 32) - OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); + OtherOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, OtherOp); return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, DAG.getConstant(PermMask, DL, MVT::i32)); diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -544,11 +544,11 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[4:11] dmask:0x7 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-UNPACKED-NEXT: v_perm_b32 v0, v1, v2, s0 ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4 @@ -644,12 +644,11 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v5, v1 ; 
GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] +; GFX8-UNPACKED-NEXT: v_perm_b32 v3, v3, v4, s0 +; GFX8-UNPACKED-NEXT: v_perm_b32 v2, v1, v2, s0 +; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1692,18 +1692,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_perm_b32 v0, v0, s4, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1850,18 +1849,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 
; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_perm_b32 v1, v1, s4, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -232,9 +232,9 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, v2 ; TONGA-NEXT: ; return to shader part epilog ; @@ -282,9 +282,9 @@ ; TONGA-NEXT: v_mov_b32_e32 v6, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: 
v_perm_b32 v0, v3, v4, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, v5 ; TONGA-NEXT: v_mov_b32_e32 v2, v6 ; TONGA-NEXT: ; return to shader part epilog @@ -368,11 +368,10 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; TONGA-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0 +; TONGA-NEXT: v_perm_b32 v1, v2, v3, s0 ; TONGA-NEXT: ; return to shader part epilog ; ; GFX81-LABEL: image_sample_b_2d_v4f16: @@ -419,11 +418,10 @@ ; TONGA-NEXT: v_mov_b32_e32 v7, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; TONGA-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0 +; TONGA-NEXT: v_perm_b32 v1, v5, v6, s0 ; TONGA-NEXT: v_mov_b32_e32 v2, v7 ; TONGA-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -2751,3 +2751,99 @@ store i32 %result, ptr addrspace(1) undef ret void } + +define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract3744: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract3744: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3070404 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <4 x i8> %vec1, i64 0 + %zv1e0 = zext i8 %v1e0 to i32 + %byte1 = shl i32 %zv1e0, 8 + + %v1e3 = extractelement <4 x i8> %vec1, i64 3 + %zv1e3 = zext i8 %v1e3 to i32 + %byte2 = shl i32 %zv1e3, 16 + %v2e3 = extractelement <4 x i8> %vec2, i64 3 + %zv2e3 = zext i8 %v2e3 to i32 + %byte3 = shl i32 %zv2e3, 24 + + %tmp0 = or i32 %zv1e0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract1347_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract1347_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; 
GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x1030407 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 + %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <2 x i16> %vec1, i64 0 + %v1e1 = extractelement <2 x i16> %vec1, i64 1 + %v2e0 = extractelement <2 x i16> %vec2, i64 0 + %v2e1 = extractelement <2 x i16> %vec2, i64 1 + + %b0t0 = and i16 -256, %v2e1 + %b0t1 = lshr i16 %b0t0, 8 + %byte0 = zext i16 %b0t1 to i32 + + %b1t0 = and i16 255, %v2e0 + %b1t1 = zext i16 %b1t0 to i32 + %byte1 = shl i32 %b1t1, 8 + + %b2t0 = and i16 -256, %v1e1 + %b2t1 = lshr i16 %b2t0, 8 + %b2t2 = zext i16 %b2t1 to i32 + %byte2 = shl i32 %b2t2, 16 + + %b3t0 = and i16 -256, %v1e0 + %b3t1 = lshr i16 %b3t0, 8 + %b3t2 = zext i16 %b3t1 to i32 + %byte3 = shl i32 %b3t2, 24 + + %tmp0 = or i32 %byte0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +}