Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -872,10 +872,12 @@
   defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
               "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
+              AMDGPUImageDMaskIntrinsic;
   defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
               "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
-              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>;
+              [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
+              AMDGPUImageDMaskIntrinsic;
 
   //////////////////////////////////////////////////////////////////////////
   // MSAA intrinsics
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -376,6 +376,36 @@
   return false;
 }
 
+// Trim all zero components from the end of the vector \p UseV and return
+// an appropriate bitset with known elements.
+static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
+                                       Instruction *I) {
+  auto *VTy = cast<FixedVectorType>(UseV->getType());
+  unsigned VWidth = VTy->getNumElements();
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+
+  for (int i = VWidth - 1; i > 0; --i) {
+    APInt DemandOneElt = APInt::getOneBitSet(VWidth, i);
+    KnownFPClass KnownFPClass =
+        computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(),
+                            /*InterestedClasses=*/fcAllFlags,
+                            /*Depth=*/0, &IC.getTargetLibraryInfo(),
+                            &IC.getAssumptionCache(), I,
+                            &IC.getDominatorTree(),
+                            &IC.getOptimizationRemarkEmitter());
+    if (KnownFPClass.KnownFPClasses != fcPosZero)
+      break;
+    DemandedElts.clearBit(i);
+  }
+  return DemandedElts;
+}
+
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+                                                    IntrinsicInst &II,
+                                                    APInt DemandedElts,
+                                                    int DMaskIdx = -1,
+                                                    bool IsLoad = true);
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1035,26 +1065,62 @@
       return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
     break;
   }
-  default: {
-    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
-            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
-      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+  case Intrinsic::amdgcn_buffer_store_format:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_struct_buffer_store_format:
+  case Intrinsic::amdgcn_raw_tbuffer_store:
+  case Intrinsic::amdgcn_struct_tbuffer_store:
+  case Intrinsic::amdgcn_tbuffer_store:
+  case Intrinsic::amdgcn_image_store_1d:
+  case Intrinsic::amdgcn_image_store_1darray:
+  case Intrinsic::amdgcn_image_store_2d:
+  case Intrinsic::amdgcn_image_store_2darray:
+  case Intrinsic::amdgcn_image_store_2darraymsaa:
+  case Intrinsic::amdgcn_image_store_2dmsaa:
+  case Intrinsic::amdgcn_image_store_3d:
+  case Intrinsic::amdgcn_image_store_cube:
+  case Intrinsic::amdgcn_image_store_mip_1d:
+  case Intrinsic::amdgcn_image_store_mip_1darray:
+  case Intrinsic::amdgcn_image_store_mip_2d:
+  case Intrinsic::amdgcn_image_store_mip_2darray:
+  case Intrinsic::amdgcn_image_store_mip_3d:
+  case Intrinsic::amdgcn_image_store_mip_cube: {
+    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+      break;
+
+    APInt DemandedElts =
+        trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+
+    int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
+    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
+                                              false)) {
+      return IC.eraseInstFromFunction(II);
     }
+
+    break;
+  }
   }
+  if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+          AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+    return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
   }
   return std::nullopt;
 }
 
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
+/// The result of simplifying amdgcn image and buffer store intrinsics is updating
+/// definitions of the intrinsics vector argument, not Uses of the result like
+/// image and buffer loads.
 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
 /// struct returns.
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
-                                                    int DMaskIdx = -1) {
+                                                    int DMaskIdx, bool IsLoad) {
 
-  auto *IIVTy = cast<FixedVectorType>(II.getType());
+  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
+                                             : II.getOperand(0)->getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
     return nullptr;
@@ -1125,13 +1191,13 @@
     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
 
     unsigned NewDMaskVal = 0;
-    unsigned OrigLoadIdx = 0;
+    unsigned OrigLdStIdx = 0;
     for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
       const unsigned Bit = 1 << SrcIdx;
       if (!!(DMaskVal & Bit)) {
-        if (!!DemandedElts[OrigLoadIdx])
+        if (!!DemandedElts[OrigLdStIdx])
           NewDMaskVal |= Bit;
-        OrigLoadIdx++;
+        OrigLdStIdx++;
       }
     }
 
@@ -1159,29 +1225,45 @@
       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
   OverloadTys[0] = NewTy;
 
+  if (!IsLoad) {
+    SmallVector<int, 8> EltMask;
+    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
+      if (DemandedElts[OrigStoreIdx])
+        EltMask.push_back(OrigStoreIdx);
+
+    if (NewNumElts == 1)
+      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
+    else
+      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
+  }
+
   Function *NewIntrin = Intrinsic::getDeclaration(
       II.getModule(), II.getIntrinsicID(), OverloadTys);
   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
 
-  if (NewNumElts == 1) {
-    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
-                                          DemandedElts.countr_zero());
-  }
+  if (IsLoad) {
+    if (NewNumElts == 1) {
+      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+                                            DemandedElts.countr_zero());
+    }
 
-  SmallVector<int, 8> EltMask;
-  unsigned NewLoadIdx = 0;
-  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
-    if (!!DemandedElts[OrigLoadIdx])
-      EltMask.push_back(NewLoadIdx++);
-    else
-      EltMask.push_back(NewNumElts);
-  }
+    SmallVector<int, 8> EltMask;
+    unsigned NewLoadIdx = 0;
+    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+      if (!!DemandedElts[OrigLoadIdx])
+        EltMask.push_back(NewLoadIdx++);
+      else
+        EltMask.push_back(NewNumElts);
+    }
+
+    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
 
-  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+    return Shuffle;
+  }
 
-  return Shuffle;
+  return NewCall;
 }
 
 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mcpu=gfx900 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mcpu=gfx1010 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GCN %s
+
+define amdgpu_ps void @image_store_1d_store_insert_all_zeros(<8 x i32> inreg %rsrc, i32 %s) #0 {
+; GCN-LABEL: @image_store_1d_store_insert_all_zeros(
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float 0.000000e+00, i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %newvdata4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, i32 %s) #0 {
+; GCN-LABEL: @image_store_1d_store_insert_zeros_at_end(
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %newvdata4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, float %vdata2, i32 %s, i32 %mip) #0 {
+; GCN-LABEL: @image_store_mip_1d_store_insert_zeros_at_end(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float 0.000000e+00, float undef, float undef>, float [[VDATA1:%.*]], i64 1
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
+; GCN-NEXT:    call void @llvm.amdgcn.image.store.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata2, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %newvdata4, i32 7, i32 %s, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @buffer_store_format_insert_zeros_at_end(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[VDATA1:%.*]], i64 0
+; GCN-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; GCN-NEXT:    call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @struct_buffer_store_format_insert_zeros(
+; GCN-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float undef, float 0.000000e+00, float undef>, float [[VDATA1:%.*]], i64 0
+; GCN-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GCN-NEXT:    call void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata1, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GCN-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning(
+; GCN-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, float [[VDATA1:%.*]], i64 3
+; GCN-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GCN-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float %vdata1, i32 3
+  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)
+  ret void
+}
+
+declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
+declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
+declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind }