diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3560,9 +3560,9 @@ auto Unmerge = B.buildUnmerge(S16, Reg); for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) PackedRegs.push_back(Unmerge.getReg(I)); - PackedRegs.resize(8, B.buildUndef(S16).getReg(0)); - Reg = B.buildBuildVector(LLT::vector(8, S16), PackedRegs).getReg(0); - return B.buildBitcast(LLT::vector(4, S32), Reg).getReg(0); + PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); + Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); + return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); } if (StoreVT.getNumElements() == 4) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7456,17 +7456,6 @@ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements); SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData); return DAG.UnrollVectorOp(ZExt.getNode()); - } else if (NumElements == 3) { - EVT IntStoreVT = - EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); - SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); - - EVT WidenedStoreVT = EVT::getVectorVT( - *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); - EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), - WidenedStoreVT.getStoreSizeInBits()); - SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); - return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); } // The sq block of gfx8.1 does not estimate register use correctly for d16 @@ -7489,9 +7478,17 @@ SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); PackedElts.push_back(IntPair); } + if ((NumElements % 2) == 1) { + // Handle v3i16 + unsigned I = Elts.size() / 2; + SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL, + {Elts[I * 2], DAG.getUNDEF(MVT::i16)}); + SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair); + PackedElts.push_back(IntPair); + } // Pad using UNDEF - PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32)); + PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32)); // Build final vector EVT VecVT = @@ -7499,6 +7496,19 @@ return DAG.getBuildVector(VecVT, DL, PackedElts); } + if (NumElements == 3) { + EVT IntStoreVT = + EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits()); + SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData); + + EVT WidenedStoreVT = EVT::getVectorVT( + *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1); + EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(), + WidenedStoreVT.getStoreSizeInBits()); + SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData); + return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt); + } + assert(isTypeLegal(StoreVT)); return VData; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-llvm.amdgcn.image.store.2d.d16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX81 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -stop-after=legalizer -global-isel-abort=0 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) { ; PACKED-LABEL: name: image_store_f16 @@ -60,6 +62,44 @@ ; GFX81: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") ; GFX81: S_ENDPGM 0 + ; GFX9-LABEL: name: image_store_f16 + ; GFX9: bb.1 (%ir-block.0): + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: image_store_f16 + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY10]](s32) + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[TRUNC]](s16), 1, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 2 into custom "TargetCustom8") + ; GFX10: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.f16.i32(half %data, i32 1, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -128,6 +168,42 @@ ; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[BITCAST]](s32), [[DEF]](s32) ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<2 x s32>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") ; GFX81: S_ENDPGM 0 + ; GFX9-LABEL: name: image_store_v2f16 + ; GFX9: bb.1 (%ir-block.0): + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: image_store_v2f16 + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[COPY10]](<2 x s16>), 3, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 4 into custom "TargetCustom8") + ; GFX10: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %in, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -245,12 +321,78 @@ ; GFX81: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32) ; GFX81: [[OR2:%[0-9]+]]:_(s32) = G_OR [[C2]], [[SHL1]] ; GFX81: [[BITCAST3:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32) - ; GFX81: [[COPY15:%[0-9]+]]:_(s32) = COPY [[OR2]](s32) - ; GFX81: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[COPY15]](s32) - ; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>) - ; GFX81: [[BITCAST5:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<8 x s16>) - ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST5]](<4 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; GFX81: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST1]](<2 x s16>), [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>) + ; GFX81: [[BITCAST4:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BITCAST4]](<3 x s32>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) ; GFX81: S_ENDPGM 0 + ; GFX9-LABEL: name: image_store_v3f16 + ; GFX9: bb.1 (%ir-block.0): + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) + ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX9: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX9: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX9: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) + ; GFX9: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX9: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX9: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32) + ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX9: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: image_store_v3f16 + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[DEF:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[CONCAT_VECTORS]](<6 x s16>) + ; GFX10: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](s96) + ; GFX10: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV]], [[C]](s32) + ; GFX10: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV1]], [[C]](s32) + ; GFX10: [[COPY12:%[0-9]+]]:_(s32) = COPY [[UV]](s32) + ; GFX10: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; GFX10: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY12]](s32), [[COPY13]](s32) + ; GFX10: [[COPY14:%[0-9]+]]:_(s32) = COPY [[UV1]](s32) + ; GFX10: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10: [[BUILD_VECTOR_TRUNC1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY14]](s32), [[DEF1]](s32) + ; GFX10: [[CONCAT_VECTORS1:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[DEF]](<2 x s16>) + ; GFX10: [[UV3:%[0-9]+]]:_(<3 x s16>), [[UV4:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<6 x s16>) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[UV3]](<3 x s16>), 7, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 6 into custom "TargetCustom8", align 8) + ; GFX10: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } @@ -329,6 +471,46 @@ ; GFX81: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[DEF]](s32), [[DEF]](s32) ; GFX81: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[BUILD_VECTOR2]](<4 x s32>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") ; GFX81: S_ENDPGM 0 + ; GFX9-LABEL: name: image_store_v4f16 + ; GFX9: bb.1 (%ir-block.0): + ; GFX9: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX9: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX9: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX9: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") + ; GFX9: S_ENDPGM 0 + ; GFX10-LABEL: name: image_store_v4f16 + ; GFX10: bb.1 (%ir-block.0): + ; GFX10: liveins: $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; GFX10: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; GFX10: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr5 + ; GFX10: [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr6 + ; GFX10: [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr7 + ; GFX10: [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr8 + ; GFX10: [[COPY7:%[0-9]+]]:_(s32) = COPY $sgpr9 + ; GFX10: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX10: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX10: [[COPY10:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2 + ; GFX10: [[COPY11:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3 + ; GFX10: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) + ; GFX10: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY10]](<2 x s16>), [[COPY11]](<2 x s16>) + ; GFX10: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32) + ; GFX10: G_AMDGPU_INTRIN_IMAGE_STORE intrinsic(@llvm.amdgcn.image.store.2d), [[CONCAT_VECTORS]](<4 x s16>), 15, [[BUILD_VECTOR1]](<2 x s32>), $noreg, [[BUILD_VECTOR]](<8 x s32>), 0, 0, 0 :: (dereferenceable store 8 into custom "TargetCustom8") + ; GFX10: S_ENDPGM 0 call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %in, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -504,11 +504,95 @@ ret <2 x half> %v } -; FIXME: -; define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) { -; %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) -; ret <3 x half> %v -; } +define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v3f16_xyz: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16 +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, 0xffff +; GFX8-UNPACKED-NEXT: s_and_b32 s1, s0, s0 +; GFX8-UNPACKED-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-UNPACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v3f16_xyz: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 +; GFX8-PACKED-NEXT: s_mov_b32 s0, 0xffff +; GFX8-PACKED-NEXT: s_and_b32 s0, s0, s0 +; GFX8-PACKED-NEXT: s_lshl_b32 s0, s0, 16 +; GFX8-PACKED-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-PACKED-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-PACKED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-PACKED-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v3f16_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 unorm d16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_or_b32 v1, v1, v2, s0 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v3f16_xyz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2 +; GFX10-NEXT: ; return to shader part epilog + %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret <3 x half> %v +} define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_v4f16_xyzw: @@ -712,13 +796,72 @@ ret float %vv } -; FIXME: -; define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) { -; %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) -; %v.err = extractvalue { <3 x half>, i32 } %v, 1 -; %vv = bitcast i32 %v.err to float -; ret float %vv -; } +define amdgpu_ps float @load_1d_v3f16_tfe_dmask_xyz(<8 x i32> inreg %rsrc, i32 %s) { +; GFX8-UNPACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz: +; GFX8-UNPACKED: ; %bb.0: +; GFX8-UNPACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-UNPACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-UNPACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-UNPACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-UNPACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-UNPACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-UNPACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-UNPACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-UNPACKED-NEXT: image_load v[0:3], v0, s[0:7] dmask:0x7 unorm tfe d16 +; GFX8-UNPACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-UNPACKED-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-UNPACKED-NEXT: ; return to shader part epilog +; +; GFX8-PACKED-LABEL: load_1d_v3f16_tfe_dmask_xyz: +; GFX8-PACKED: ; %bb.0: +; GFX8-PACKED-NEXT: s_mov_b32 s0, s2 +; GFX8-PACKED-NEXT: s_mov_b32 s1, s3 +; GFX8-PACKED-NEXT: s_mov_b32 s2, s4 +; GFX8-PACKED-NEXT: s_mov_b32 s3, s5 +; GFX8-PACKED-NEXT: s_mov_b32 s4, s6 +; GFX8-PACKED-NEXT: s_mov_b32 s5, s7 +; GFX8-PACKED-NEXT: s_mov_b32 s6, s8 +; GFX8-PACKED-NEXT: s_mov_b32 s7, s9 +; GFX8-PACKED-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16 +; GFX8-PACKED-NEXT: s_waitcnt vmcnt(0) +; GFX8-PACKED-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-PACKED-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: load_1d_v3f16_tfe_dmask_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: s_mov_b32 s1, s3 +; GFX9-NEXT: s_mov_b32 s2, s4 +; GFX9-NEXT: s_mov_b32 s3, s5 +; GFX9-NEXT: s_mov_b32 s4, s6 +; GFX9-NEXT: s_mov_b32 s5, s7 +; GFX9-NEXT: s_mov_b32 s6, s8 +; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 unorm tfe d16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: load_1d_v3f16_tfe_dmask_xyz: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: s_mov_b32 s2, s4 +; GFX10-NEXT: s_mov_b32 s3, s5 +; GFX10-NEXT: s_mov_b32 s4, s6 +; GFX10-NEXT: s_mov_b32 s5, s7 +; GFX10-NEXT: s_mov_b32 s6, s8 +; GFX10-NEXT: s_mov_b32 s7, s9 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: image_load v[0:2], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: ; return to shader part epilog + %v = call { <3 x half>, i32 } @llvm.amdgcn.image.load.1d.sl_v3f16i32s.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue { <3 x half>, i32 } %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} define amdgpu_ps float @load_1d_v4f16_tfe_dmask_xyzw(<8 x i32> inreg %rsrc, i32 %s) { ; GFX8-UNPACKED-LABEL: load_1d_v4f16_tfe_dmask_xyzw: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.d16.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -o - %s | FileCheck -check-prefix=UNPACKED %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -o - %s | FileCheck -check-prefix=GFX81 %s +; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s +; FIXME: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10 %s define amdgpu_ps void @image_store_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, half %data) { ; UNPACKED-LABEL: image_store_f16: @@ -16,19 +18,6 @@ ; UNPACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 ; UNPACKED-NEXT: s_endpgm ; -; PACKED-LABEL: image_store_f16: -; PACKED: ; %bb.0: -; PACKED-NEXT: s_mov_b32 s0, s2 -; PACKED-NEXT: s_mov_b32 s1, s3 -; PACKED-NEXT: s_mov_b32 s2, s4 -; PACKED-NEXT: s_mov_b32 s3, s5 -; PACKED-NEXT: s_mov_b32 s4, s6 -; PACKED-NEXT: s_mov_b32 s5, s7 -; PACKED-NEXT: s_mov_b32 s6, s8 -; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x1 unorm d16 -; PACKED-NEXT: s_endpgm -; ; GFX81-LABEL: image_store_f16: ; GFX81: ; %bb.0: ; GFX81-NEXT: s_mov_b32 s0, s2 @@ -60,19 +49,6 @@ ; UNPACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16 ; UNPACKED-NEXT: s_endpgm ; -; PACKED-LABEL: image_store_v2f16: -; PACKED: ; %bb.0: -; PACKED-NEXT: s_mov_b32 s0, s2 -; PACKED-NEXT: s_mov_b32 s1, s3 -; PACKED-NEXT: s_mov_b32 s2, s4 -; PACKED-NEXT: s_mov_b32 s3, s5 -; PACKED-NEXT: s_mov_b32 s4, s6 -; PACKED-NEXT: s_mov_b32 s5, s7 -; PACKED-NEXT: s_mov_b32 s6, s8 -; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16 -; PACKED-NEXT: s_endpgm -; ; GFX81-LABEL: image_store_v2f16: ; GFX81: ; %bb.0: ; GFX81-NEXT: s_mov_b32 s0, s2 @@ -89,11 +65,44 @@ ret void } -; FIXME: Broken -; define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) { -; call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) -; ret void -; } +define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <3 x half> %in) { +; UNPACKED-LABEL: image_store_v3f16: +; UNPACKED: ; %bb.0: +; UNPACKED-NEXT: v_mov_b32_e32 v5, v1 +; UNPACKED-NEXT: v_mov_b32_e32 v1, v2 +; UNPACKED-NEXT: s_mov_b32 s0, s2 +; UNPACKED-NEXT: s_mov_b32 s1, s3 +; UNPACKED-NEXT: s_mov_b32 s2, s4 +; UNPACKED-NEXT: s_mov_b32 s3, s5 +; UNPACKED-NEXT: s_mov_b32 s4, s6 +; UNPACKED-NEXT: s_mov_b32 s5, s7 +; UNPACKED-NEXT: s_mov_b32 s6, s8 +; UNPACKED-NEXT: s_mov_b32 s7, s9 +; UNPACKED-NEXT: v_mov_b32_e32 v4, v0 +; UNPACKED-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; UNPACKED-NEXT: image_store v[1:3], v[4:5], s[0:7] dmask:0x7 unorm d16 +; UNPACKED-NEXT: s_endpgm +; +; GFX81-LABEL: image_store_v3f16: +; GFX81: ; %bb.0: +; GFX81-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX81-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX81-NEXT: s_mov_b32 s0, s2 +; GFX81-NEXT: s_mov_b32 s1, s3 +; GFX81-NEXT: s_mov_b32 s2, s4 +; GFX81-NEXT: s_mov_b32 s3, s5 +; GFX81-NEXT: s_mov_b32 s4, s6 +; GFX81-NEXT: s_mov_b32 s5, s7 +; GFX81-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX81-NEXT: s_mov_b32 s6, s8 +; GFX81-NEXT: s_mov_b32 s7, s9 +; GFX81-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX81-NEXT: v_mov_b32_e32 v4, 0 +; GFX81-NEXT: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16 +; GFX81-NEXT: s_endpgm + call void @llvm.amdgcn.image.store.2d.v3f16.i32(<3 x half> %in, i32 7, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <4 x half> %in) { ; UNPACKED-LABEL: image_store_v4f16: @@ -114,19 +123,6 @@ ; UNPACKED-NEXT: image_store v[1:4], v[5:6], s[0:7] dmask:0xf unorm d16 ; UNPACKED-NEXT: s_endpgm ; -; PACKED-LABEL: image_store_v4f16: -; PACKED: ; %bb.0: -; PACKED-NEXT: s_mov_b32 s0, s2 -; PACKED-NEXT: s_mov_b32 s1, s3 -; PACKED-NEXT: s_mov_b32 s2, s4 -; PACKED-NEXT: s_mov_b32 s3, s5 -; PACKED-NEXT: s_mov_b32 s4, s6 -; PACKED-NEXT: s_mov_b32 s5, s7 -; PACKED-NEXT: s_mov_b32 s6, s8 -; PACKED-NEXT: s_mov_b32 s7, s9 -; PACKED-NEXT: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16 -; PACKED-NEXT: s_endpgm -; ; GFX81-LABEL: image_store_v4f16: ; GFX81: ; %bb.0: ; GFX81-NEXT: s_mov_b32 s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll @@ -72,6 +72,12 @@ ret float %x } + +; GCN-LABEL: {{^}}image_load_3d_v3f16: +; UNPACKED: image_load v[0:2], v[0:2], s[0:7] dmask:0x7 unorm d16 +; PACKED: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 unorm d16 +; GFX81: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 unorm d16 +; GFX10: image_load v[0:1], v[0:2], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_3D unorm d16{{$}} define amdgpu_ps <2 x float> @image_load_3d_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { main_body: %tex = call <3 x half> @llvm.amdgcn.image.load.3d.v3f16.i32(i32 7, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -103,6 +109,11 @@ ret void } +; GCN-LABEL: {{^}}image_store_v3f16: +; UNPACKED: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16 +; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 unorm d16 +; GFX81: image_store v[2:4], v[0:1], s[0:7] dmask:0x7 unorm d16 +; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_2D unorm d16{{$}} define amdgpu_ps void @image_store_v3f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) { main_body: %r = bitcast <2 x float> %in to <4 x half>