diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -385,17 +385,20 @@ APInt DemandedElts = APInt::getAllOnes(VWidth); for (int i = VWidth - 1; i > 0; --i) { - APInt DemandOneElt = APInt::getOneBitSet(VWidth, i); - KnownFPClass KnownFPClass = - computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(), - /*InterestedClasses=*/fcAllFlags, - /*Depth=*/0, &IC.getTargetLibraryInfo(), - &IC.getAssumptionCache(), I, - &IC.getDominatorTree()); - if (KnownFPClass.KnownFPClasses != fcPosZero) + auto *Elt = findScalarElement(UseV, i); + if (!Elt) break; + + if (auto *ConstElt = dyn_cast(Elt)) { + if (!ConstElt->isNullValue() && !isa(Elt)) + break; + } else { + break; + } + DemandedElts.clearBit(i); } + return DemandedElts; } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll @@ -84,6 +84,19 @@ ret void } +define amdgpu_ps void @struct_tbuffer_store_insert_undefs(<4 x i32> inreg %a, float %vdata1, i32 %b) { +; GCN-LABEL: @struct_tbuffer_store_insert_undefs( +; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[VDATA1:%.*]], i64 0 +; GCN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GCN-NEXT: ret void +; + %newvdata1 = insertelement <4 x float> poison, float %vdata1, i32 0 + %newvdata2 = insertelement <4 x float> %newvdata1, float 1.0, i32 1 + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata2, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15) + ret void +} + + declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2