diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -10458,6 +10458,25 @@ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + SDValue NarrowOp = Op->getOperand(0); + auto NarrowVT = NarrowOp.getValueType(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowVT = VTSign->getVT(); + } + if (!NarrowVT.isByteSized()) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); + + if (SrcIndex >= NarrowByteWidth) + return std::nullopt; + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10497,7 +10516,8 @@ unsigned BitWidth = Op.getScalarValueSizeInBits(); if (BitWidth % 8 != 0) return std::nullopt; - assert(Index < BitWidth / 8 && "invalid index requested"); + if (Index > BitWidth / 8 - 1) + return std::nullopt; switch (Op.getOpcode()) { case ISD::OR: { @@ -10540,6 +10560,7 @@ return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); } + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10586,9 +10607,18 @@ } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: + case ISD::AssertZext: + case ISD::AssertSext: { SDValue NarrowOp = Op->getOperand(0); - unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || + Op->getOpcode() == ISD::AssertZext || + Op->getOpcode() == ISD::AssertSext) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowBitWidth = VTSign->getVT().getSizeInBits(); + } if 
(NarrowBitWidth % 8 != 0) return std::nullopt; uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -10602,10 +10632,7 @@ } case ISD::TRUNCATE: { - unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); - if (NarrowBitWidth % 8 != 0) - return std::nullopt; - uint64_t NarrowByteWidth = NarrowBitWidth / 8; + uint64_t NarrowByteWidth = BitWidth / 8; if (NarrowByteWidth >= Index) { return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, @@ -10616,10 +10643,6 @@ } case ISD::CopyFromReg: { - auto BitWidth = Op.getScalarValueSizeInBits(); - if (BitWidth % 8) - llvm_unreachable("Invalid type in CopyFromReg"); - if (BitWidth / 8 > Index) return calculateSrcByte(Op, StartingIndex, Index); @@ -10653,6 +10676,23 @@ case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, Depth + 1, StartingIndex); + + case ISD::EXTRACT_VECTOR_ELT: { + auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!IdxOp) + return std::nullopt; + auto VecIdx = IdxOp->getZExtValue(); + auto ScalarSize = Op.getScalarValueSizeInBits(); + if (ScalarSize != 32) { + if ((VecIdx + 1) * ScalarSize > 32) + return std::nullopt; + Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; + } + + return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0), + StartingIndex, Index); + } + default: { return std::nullopt; } @@ -10854,9 +10894,8 @@ std::optional<ByteProvider<SDValue>> P = calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); // TODO support constantZero - if (!P || P->isConstantZero()) { + if (!P || P->isConstantZero()) return SDValue(); - } PermNodes.push_back(*P); } @@ -10892,7 +10931,7 @@ : *PermNodes[FirstSrc].Src; // Check that we are not just extracting the bytes in order from an op - if (Op == OtherOp) { + if (Op == OtherOp && Op.getValueSizeInBits() == 32) { int Low16 = PermMask & 0xffff; int Hi16 = (PermMask & 0xffff0000) >> 16; @@ -10901,13 +10940,19 @@ // The perm op would really just produce Op. 
So combine into Op if (WellFormedLow && WellFormedHi) - return Op; + return DAG.getBitcast(MVT::getIntegerVT(32), Op); } if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { SDLoc DL(N); assert(Op.getValueType().isByteSized() && OtherOp.getValueType().isByteSized()); + + // Handle potential vectors + Op = DAG.getBitcast(MVT::getIntegerVT(Op.getValueSizeInBits()), Op); + OtherOp = DAG.getBitcast( + MVT::getIntegerVT(OtherOp.getValueSizeInBits()), OtherOp); + if (Op.getValueSizeInBits() < 32) // If the ultimate src is less than 32 bits, then we will only be // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. @@ -10917,7 +10962,7 @@ Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op); if (OtherOp.getValueSizeInBits() < 32) - OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op); + OtherOp = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, OtherOp); return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, DAG.getConstant(PermMask, DL, MVT::i32)); diff --git a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll --- a/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll +++ b/llvm/test/CodeGen/AMDGPU/image-load-d16-tfe.ll @@ -544,11 +544,11 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:4], v0, s[4:11] dmask:0x7 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-UNPACKED-NEXT: v_perm_b32 v0, v1, v2, s0 ; GFX8-UNPACKED-NEXT: flat_store_short v[0:1], v3 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v0 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v4 @@ -644,12 +644,11 @@ ; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v4, v1 ; GFX8-UNPACKED-NEXT: 
v_mov_b32_e32 v5, v1 ; GFX8-UNPACKED-NEXT: image_load v[1:5], v0, s[4:11] dmask:0xf unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0x1000504 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[1:2] +; GFX8-UNPACKED-NEXT: v_perm_b32 v3, v3, v4, s0 +; GFX8-UNPACKED-NEXT: v_perm_b32 v2, v1, v2, s0 +; GFX8-UNPACKED-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) ; GFX8-UNPACKED-NEXT: flat_store_dword v[0:1], v5 ; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1692,18 +1692,17 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_perm_b32 v0, v0, s4, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1850,18 +1849,17 @@ ; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_load_dword s4, s[4:5], 0x10 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x1000504 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; VI-NEXT: s_lshl_b32 s0, s4, 16 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_perm_b32 v1, v1, s4, v4 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -232,9 +232,9 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, v2 ; TONGA-NEXT: ; return to shader part epilog ; @@ -282,9 +282,9 @@ ; TONGA-NEXT: v_mov_b32_e32 v6, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0 ; TONGA-NEXT: v_mov_b32_e32 v1, v5 ; TONGA-NEXT: v_mov_b32_e32 v2, v6 ; TONGA-NEXT: ; return to shader part epilog @@ -368,11 +368,10 @@ ; TONGA-NEXT: s_wqm_b64 exec, exec ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; TONGA-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v0, v1, s0 +; TONGA-NEXT: v_perm_b32 v1, v2, v3, s0 ; TONGA-NEXT: ; return to shader part epilog ; ; GFX81-LABEL: image_sample_b_2d_v4f16: @@ -419,11 +418,10 @@ ; TONGA-NEXT: v_mov_b32_e32 v7, v3 ; TONGA-NEXT: s_and_b64 exec, exec, s[12:13] ; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16 +; TONGA-NEXT: s_mov_b32 s0, 0x1000504 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; TONGA-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; TONGA-NEXT: v_perm_b32 v0, v3, v4, s0 +; TONGA-NEXT: v_perm_b32 v1, v5, v6, s0 ; TONGA-NEXT: v_mov_b32_e32 v2, v7 ; TONGA-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -2751,3 +2751,218 @@ store i32 %result, ptr addrspace(1) undef ret void } + +define hidden void @extract3744(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: 
extract3744: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v7, 0x3070404 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract3744: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3070404 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v7, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <4 x i8> %vec1, i64 0 + %zv1e0 = zext i8 %v1e0 to i32 + %byte1 = shl i32 %zv1e0, 8 + + %v1e3 = extractelement <4 x i8> %vec1, i64 3 + %zv1e3 = zext i8 %v1e3 to i32 + %byte2 = shl i32 %zv1e3, 16 + %v2e3 = extractelement <4 x i8> %vec2, i64 3 + %zv2e3 = zext i8 %v2e3 to i32 + %byte3 = shl i32 %zv2e3, 24 + + %tmp0 = or i32 %zv1e0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @extract1347_v2i16(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0) { +; GFX10-LABEL: extract1347_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: global_load_dword v7, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v7, v6, 0x1030407 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extract1347_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
global_load_dword v6, v[0:1], off +; GFX9-NEXT: global_load_dword v7, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x1030407 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v7, v6, s4 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <2 x i16>, ptr addrspace(1) %in0, align 4 + %vec2 = load <2 x i16>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <2 x i16> %vec1, i64 0 + %v1e1 = extractelement <2 x i16> %vec1, i64 1 + %v2e0 = extractelement <2 x i16> %vec2, i64 0 + %v2e1 = extractelement <2 x i16> %vec2, i64 1 + + %b0t0 = and i16 -256, %v2e1 + %b0t1 = lshr i16 %b0t0, 8 + %byte0 = zext i16 %b0t1 to i32 + + %b1t0 = and i16 255, %v2e0 + %b1t1 = zext i16 %b1t0 to i32 + %byte1 = shl i32 %b1t1, 8 + + %b2t0 = and i16 -256, %v1e1 + %b2t1 = lshr i16 %b2t0, 8 + %b2t2 = zext i16 %b2t1 to i32 + %byte2 = shl i32 %b2t2, 16 + + %b3t0 = and i16 -256, %v1e0 + %b3t1 = lshr i16 %b3t0, 8 + %b3t2 = zext i16 %b3t1 to i32 + %byte3 = shl i32 %b3t2, 24 + + %tmp0 = or i32 %byte0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +define hidden void @shlbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i32 %base) { +; GFX10-LABEL: shlbase: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v7, v[0:1], off +; GFX10-NEXT: global_load_dword v8, v[2:3], off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 16, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 8, v6 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX10-NEXT: 
v_lshl_or_b32 v2, v2, v3, v2 +; GFX10-NEXT: v_or3_b32 v0, v2, v0, v1 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: shlbase: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_dword v8, v[2:3], off +; GFX9-NEXT: v_add_u32_e32 v0, 8, v6 +; GFX9-NEXT: v_add_u32_e32 v1, 16, v6 +; GFX9-NEXT: v_add_u32_e32 v2, 24, v6 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v3, 0xff, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_lshl_or_b32 v0, v3, v0, v3 +; GFX9-NEXT: v_or3_b32 v0, v0, v1, v2 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %v1e0 = extractelement <4 x i8> %vec1, i64 0 + %zv1e0 = zext i8 %v1e0 to i32 + %b8 = add i32 %base, 8 + %byte1 = shl i32 %zv1e0, %b8 + + %v1e3 = extractelement <4 x i8> %vec1, i64 3 + %zv1e3 = zext i8 %v1e3 to i32 + %b16 = add i32 %base, 16 + %byte2 = shl i32 %zv1e3, %b16 + %v2e3 = extractelement <4 x i8> %vec2, i64 3 + %zv2e3 = zext i8 %v2e3 to i32 + %b24 = add i32 %base, 24 + %byte3 = shl i32 %zv2e3, %b24 + + %tmp0 = or i32 %zv1e0, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +} + +; TODO -- lower into v_perm +define hidden void @extractbase(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ptr addrspace(1) %out0, i64 %base) { +; GFX10-LABEL: extractbase: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v7, v[0:1], off +; GFX10-NEXT: 
global_load_dword v8, v[2:3], off +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_bfe_u32 v2, v7, v1, 8 +; GFX10-NEXT: v_bfe_u32 v0, v7, v0, 8 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 8, v0 +; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-NEXT: global_store_dword v[4:5], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: extractbase: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v7, v[0:1], off +; GFX9-NEXT: global_load_dword v8, v[2:3], off +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v6 +; GFX9-NEXT: v_add_u32_e32 v1, 24, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_bfe_u32 v0, v7, v0, 8 +; GFX9-NEXT: v_bfe_u32 v2, v7, v1, 8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_sdwa v1, v1, v8 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 8, v0 +; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] + %vec1 = load <4 x i8>, ptr addrspace(1) %in0, align 4 + %vec2 = load <4 x i8>, ptr addrspace(1) %in1, align 4 + %v1b = extractelement <4 x i8> %vec1, i64 %base + %zv1b = zext i8 %v1b to i32 + %byte1 = shl i32 %zv1b, 8 + + %b3 = add i64 %base, 3 + %v1b3 = extractelement <4 x i8> %vec1, i64 %b3 + %zv1b3 = zext i8 %v1b3 to i32 + %byte2 = shl i32 %zv1b3, 16 + %v2b3 = extractelement <4 x i8> %vec2, i64 %b3 + %zv2b3 = zext i8 %v2b3 to i32 + %byte3 = shl i32 %zv2b3, 24 + + %tmp0 = or i32 %zv1b, %byte1 + %tmp1 = or i32 %tmp0, %byte2 + %res = or i32 %tmp1, %byte3 + store i32 %res, ptr addrspace(1) %out0, align 4 + ret void +}