Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -385,6 +385,9 @@ APInt DemandedElts = APInt::getAllOnes(VWidth); for (int i = VWidth - 1; i > 0; --i) { + // Don't call findScalarElement unless UseV has a definition. + auto *Elt = isa(UseV) ? findScalarElement(UseV, i) : UseV; + APInt DemandOneElt = APInt::getOneBitSet(VWidth, i); KnownFPClass KnownFPClass = computeKnownFPClass(UseV, DemandOneElt, IC.getDataLayout(), @@ -392,8 +395,9 @@ /*Depth=*/0, &IC.getTargetLibraryInfo(), &IC.getAssumptionCache(), I, &IC.getDominatorTree()); - if (KnownFPClass.KnownFPClasses != fcPosZero) + if (KnownFPClass.KnownFPClasses != fcPosZero && !isa(Elt)) break; + DemandedElts.clearBit(i); } return DemandedElts; Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll =================================================================== --- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll +++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll @@ -84,6 +84,19 @@ ret void } +define amdgpu_ps void @struct_tbuffer_store_insert_undefs(<4 x i32> inreg %a, float %vdata1, i32 %b) { +; GCN-LABEL: @struct_tbuffer_store_insert_undefs( +; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[VDATA1:%.*]], i64 0 +; GCN-NEXT: call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> [[TMP1]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15) +; GCN-NEXT: ret void +; + %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0 + %newvdata2 = insertelement <4 x float> %newvdata1, float 1.0, i32 1 + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata2, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15) + ret void +} + + declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2