diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10209,10 +10209,12 @@
   if (Depth >= 6)
     return std::nullopt;
 
+  auto SrcSize = Op.getValueSizeInBits();
+  if (SrcSize != 8 && SrcSize != 16 && SrcSize != 32)
+    return std::nullopt;
+
   switch (Op->getOpcode()) {
   case ISD::TRUNCATE: {
-    if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
-      return std::nullopt;
    return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
  }
 
@@ -10232,9 +10234,6 @@
   }
 
   default: {
-    if (Op.getScalarValueSizeInBits() != 32)
-      return std::nullopt;
-
     return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
   }
   }
@@ -10376,6 +10375,17 @@
     return std::nullopt;
   }
 
+  case ISD::CopyFromReg: {
+    auto BitWidth = Op.getScalarValueSizeInBits();
+    if (BitWidth % 8)
+      return std::nullopt;
+
+    if (BitWidth / 8 > Index)
+      return calculateSrcByte(Op, StartingIndex, Index);
+
+    return std::nullopt;
+  }
+
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
     unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
@@ -10459,11 +10469,21 @@
   int Hi16 = (PermMask & 0xffff0000) >> 16;
 
   // ByteProvider only accepts 32 bit operands
-  assert(Op.getValueType().getSizeInBits() == 32);
-  assert(OtherOp.getValueType().getSizeInBits() == 32);
-
-  auto OpIs16Bit = is16BitScalarOp(Op);
-  auto OtherOpIs16Bit = is16BitScalarOp(Op);
+  assert(!(Op.getValueType().getSizeInBits() % 8));
+  assert(!(OtherOp.getValueType().getSizeInBits() % 8));
+
+  auto TempOp = Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op;
+  auto TempOtherOp =
+      OtherOp.getOpcode() == ISD::BITCAST ? OtherOp.getOperand(0) : OtherOp;
+
+  // Vectors of 16 bit ops should be counted as 16 bit ops. If they are cleanly
+  // addressed, then there are no eight bit accesses.
+  auto OpIs16Bit = TempOp.getValueType().getSizeInBits() == 16 ||
+                   is16BitScalarOp(TempOp) ||
+                   TempOp.getScalarValueSizeInBits() == 16;
+  auto OtherOpIs16Bit = TempOtherOp.getValueType().getSizeInBits() == 16 ||
+                        is16BitScalarOp(TempOtherOp) ||
+                        TempOtherOp.getScalarValueSizeInBits() == 16;
 
   // If there is a size mismatch, then we must use masking on at least one
   // operand
@@ -10471,8 +10491,9 @@
     return true;
 
   // If both operands are 16 bit, return whether or not we cleanly address both
-  if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
+  if (OpIs16Bit && OtherOpIs16Bit) {
     return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+  }
 
   // Both are 32 bit operands
   return true;
@@ -10600,8 +10621,9 @@
     std::optional<ByteProvider<SDValue>> P =
         calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
     // TODO support constantZero
-    if (!P || P->isConstantZero())
+    if (!P || P->isConstantZero()) {
       return SDValue();
+    }
 
     PermNodes.push_back(*P);
   }
@@ -10621,10 +10643,10 @@
       if (SecondSrc.has_value())
        if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
          return SDValue();
+
      // Set the index of the second distinct Src node
      SecondSrc = i;
-      assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
-             32);
+      assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
      SrcByteAdjust = 0;
    }
    assert(PermOp.SrcOffset + SrcByteAdjust < 8);
@@ -10651,6 +10673,12 @@
   if (hasEightBitAccesses(permMask, Op, OtherOp)) {
     SDLoc DL(N);
+    assert(
+        !(Op.getValueSizeInBits() % 8 || OtherOp.getValueSizeInBits() % 8));
+    if (Op.getValueSizeInBits() < 32)
+      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
+    if (OtherOp.getValueSizeInBits() < 32)
+      OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, OtherOp);
     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                        DAG.getConstant(permMask, DL, MVT::i32));
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -1305,9 +1305,6 @@
 ; GFX9-LABEL: test_ret_v3bf16:
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1315,9 +1312,7 @@
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <3 x bfloat> %in
@@ -1850,9 +1845,6 @@
 ; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x400
-; GFX9-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v4
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
@@ -1890,11 +1882,9 @@
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xffff0000, v0
-; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_writelane_b32 v3, s30, 0
+; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v4
 ; GFX10-NEXT:    v_writelane_b32 v3, s31, 1
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
--- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -682,12 +682,12 @@
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_mov_b32 m0, -1
 ; GFX803-NEXT:    ds_read_u16 v0, v0
-; GFX803-NEXT:    s_mov_b32 s4, 0x3020504
 ; GFX803-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX803-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX803-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX803-NEXT:    ds_write_b16 v3, v2
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(1)
-; GFX803-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX803-NEXT:    flat_store_dword v[0:1], v0
 ; GFX803-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX803-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -2809,3 +2809,41 @@
   store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
   ret void
 }
+
+define void @Source16Bit(i16 %in, <2 x i16> %reg) {
+; GFX10-LABEL: Source16Bit:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x3050204
+; GFX10-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: Source16Bit:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT:    s_mov_b32 s4, 0x3050204
+; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %elt0 = extractelement <2 x i16> %reg, i32 1
+  %e0b0 = and i16 %elt0, 255
+  %e0b1 = and i16 %elt0, -256
+  %e1b0 = and i16 %in, 255
+  %e1b1 = and i16 %in, -256
+  %tmp0 = shl i16 %e0b0, 8
+  %byte0 = or i16 %tmp0, %e1b0
+  %tmp2 = lshr i16 %e1b1, 8
+  %byte1 = or i16 %e0b1, %tmp2
+  %ext0 = zext i16 %byte0 to i32
+  %ext1 = zext i16 %byte1 to i32
+  %shifted = shl i32 %ext1, 16
+  %result = or i32 %shifted, %ext0
+  store i32 %result, ptr addrspace(1) undef
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -150,8 +150,8 @@
 ; VI-LABEL: trunc_v2i64_arg_to_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI-NEXT:    s_mov_b32 s4, 0x1000504
+; VI-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; VI-NEXT:    s_setpc_b64 s[30:31]
   %trunc = trunc <2 x i64> %arg0 to <2 x i16>
   ret <2 x i16> %trunc