Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -349,6 +349,89 @@
   return false;
 }
 
+// Demanded elements are those covered by DMask, minus any zero components at
+// the end of the vector passed to the store.
+static APInt findDemandedElts(Instruction *I, unsigned DMaskVal = 0xf) {
+  unsigned VWidth = cast<FixedVectorType>(I->getType())->getNumElements();
+  Instruction *CurInst = I;
+  Value *FirstArg = CurInst;
+
+  // ValueComponents contains all distinct components.
+  SmallVector<Value *, 4> ValueComponents(VWidth, nullptr);
+  // Components contains the positions of the set components.
+  SmallVector<int, 4> Components(VWidth, -1);
+  int LastElem = 0;
+
+  // Mark every still-unset component as demanded; used when the rest of the
+  // vector cannot be analyzed.
+  auto KeepUnsetComponents = [&] {
+    for (unsigned i = 0; i < VWidth; ++i)
+      if (Components[i] == -1)
+        Components[i] = -2;
+  };
+
+  while (!isa<UndefValue>(FirstArg)) {
+    CurInst = dyn_cast<Instruction>(FirstArg);
+    if (!CurInst) {
+      auto *ConstVec = dyn_cast<Constant>(FirstArg);
+      if (ConstVec && isa<FixedVectorType>(ConstVec->getType())) {
+        // Components[i] == -1 iff the component was set to 0 or wasn't set.
+        for (unsigned i = 0; i < VWidth; ++i) {
+          if (Components[i] != -1)
+            continue;
+
+          Constant *Elt = ConstVec->getAggregateElement(i);
+          // Map all non-zero constants to -2 in the Components vector.
+          if (Elt && !Elt->isZeroValue())
+            Components[i] = -2;
+        }
+      } else {
+        // An unknown base vector may define all remaining components.
+        KeepUnsetComponents();
+      }
+      break;
+    }
+    if (auto *IEI = dyn_cast<InsertElementInst>(CurInst)) {
+      auto *IEIIndex = dyn_cast<ConstantInt>(IEI->getOperand(2));
+      if (!IEIIndex) {
+        // A variable insert index may write any component; keep them all.
+        KeepUnsetComponents();
+        break;
+      }
+      auto *Comp = IEI->getOperand(1);
+      unsigned CompIdx = IEIIndex->getZExtValue();
+
+      // A zero component stays unset (-1), just like a zero lane of a
+      // constant vector, so that trailing zeros written by insertelement can
+      // be trimmed as well.
+      if (auto *ConstComp = dyn_cast<Constant>(Comp);
+          ConstComp && ConstComp->isZeroValue()) {
+        FirstArg = IEI->getOperand(0);
+        continue;
+      }
+
+      // Look for the current component in ValueComponents.
+      bool HasComp = false;
+      for (int i = 0; i < LastElem; ++i) {
+        if (Comp == ValueComponents[i]) {
+          HasComp = true;
+          Components[CompIdx] = i;
+          break;
+        }
+      }
+      if (!HasComp) {
+        ValueComponents[LastElem] = Comp;
+        Components[CompIdx] = LastElem++;
+      }
+
+      FirstArg = CurInst->getOperand(0);
+    } else if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(CurInst)) {
+      for (unsigned i = 0; i < VWidth; i++) {
+        int MaskVal = Shuffle->getMaskValue(i);
+        if (MaskVal == -1 || MaskVal >= (int)(2 * VWidth))
+          continue;
+        // Components taken from either shuffle operand count as set.
+        if (MaskVal < (int)VWidth)
+          Components[i] = MaskVal;
+        else
+          Components[i] = MaskVal - VWidth;
+      }
+      break;
+    } else {
+      // Any other instruction may define all remaining components.
+      KeepUnsetComponents();
+      break;
+    }
+  }
+
+  APInt DemandedElts = APInt::getAllOnes(VWidth);
+  for (int i = VWidth - 1; i >= 0; --i) {
+    // If the component is unset, is a known zero, or is set but not covered
+    // by DMask, do not include it in DemandedElts.
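+    // E.g. with DMaskVal == 0xf and Components == {0, 1, -1, -1}, lanes 3
+    // and 2 are cleared, the loop stops at lane 1, and DemandedElts ends up
+    // as 0b0011.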
+    if (Components[i] == -1 || !(DMaskVal & (1 << i)))
+      DemandedElts.clearBit(i);
+    else
+      break;
+  }
+
+  return DemandedElts;
+}
+
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+                                                    IntrinsicInst &II,
+                                                    APInt DemandedElts,
+                                                    int DMaskIdx = -1,
+                                                    bool IsLoad = true);
+
 std::optional<Instruction *>
 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
   Intrinsic::ID IID = II.getIntrinsicID();
@@ -1048,6 +1131,65 @@
     return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
     break;
   }
+  case Intrinsic::amdgcn_buffer_store:
+  case Intrinsic::amdgcn_buffer_store_format:
+  case Intrinsic::amdgcn_raw_buffer_store:
+  case Intrinsic::amdgcn_raw_buffer_store_format:
+  case Intrinsic::amdgcn_raw_tbuffer_store:
+  case Intrinsic::amdgcn_struct_buffer_store:
+  case Intrinsic::amdgcn_struct_buffer_store_format:
+  case Intrinsic::amdgcn_struct_tbuffer_store:
+  case Intrinsic::amdgcn_tbuffer_store: {
+    // Scalar stores are already minimal.
+    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+      break;
+
+    auto *IIVTy = cast<FixedVectorType>(II.getArgOperand(0)->getType());
+    unsigned VWidth = IIVTy->getNumElements();
+    APInt DemandedElts = APInt::getAllOnes(VWidth);
+    if (auto *SrcVec = dyn_cast<Instruction>(II.getArgOperand(0)))
+      DemandedElts = findDemandedElts(SrcVec);
+
+    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, -1, false))
+      return IC.eraseInstFromFunction(II);
+
+    break;
+  }
+  case Intrinsic::amdgcn_image_store_1d:
+  case Intrinsic::amdgcn_image_store_1darray:
+  case Intrinsic::amdgcn_image_store_2d:
+  case Intrinsic::amdgcn_image_store_2darray:
+  case Intrinsic::amdgcn_image_store_2darraymsaa:
+  case Intrinsic::amdgcn_image_store_2dmsaa:
+  case Intrinsic::amdgcn_image_store_3d:
+  case Intrinsic::amdgcn_image_store_cube:
+  case Intrinsic::amdgcn_image_store_mip_1d:
+  case Intrinsic::amdgcn_image_store_mip_1darray:
+  case Intrinsic::amdgcn_image_store_mip_2d:
+  case Intrinsic::amdgcn_image_store_mip_2darray:
+  case Intrinsic::amdgcn_image_store_mip_3d:
+  case Intrinsic::amdgcn_image_store_mip_cube: {
+    // Scalar stores are already minimal.
+    if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+      break;
+
+    auto *IIVTy = cast<FixedVectorType>(II.getArgOperand(0)->getType());
+    unsigned VWidth = IIVTy->getNumElements();
+    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(1));
+    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+
+    // If DMask needs more than VWidth bits for its representation, its bits
+    // no longer map 1:1 onto the lanes of the data vector, which means the
+    // intrinsic has already been optimized.
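+    // For example, with VWidth == 2 a DMask of 0b0100 refers to a component
+    // beyond the two remaining data lanes, so there is nothing left to trim.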
+    if (DMaskVal >= (1u << VWidth))
+      break;
+
+    APInt DemandedElts = APInt::getAllOnes(VWidth);
+    if (auto *SrcVec = dyn_cast<Instruction>(II.getArgOperand(0)))
+      DemandedElts = findDemandedElts(SrcVec, DMaskVal);
+
+    if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 1, false))
+      return IC.eraseInstFromFunction(II);
+
+    break;
+  }
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
@@ -1060,14 +1202,18 @@
 
 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
 ///
+/// Unlike image and buffer loads, where the uses of the intrinsic's result
+/// are simplified, simplifying a store rewrites the intrinsic's first
+/// operand, the vector of data being stored.
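+/// For example, a <4 x float> buffer store whose two trailing lanes are known
+/// zeros is shrunk to a <2 x float> store of the two demanded lanes.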
+///
 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
 /// struct returns.
 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
-                                                    int DMaskIdx = -1) {
+                                                    int DMaskIdx, bool IsLoad) {
 
-  auto *IIVTy = cast<FixedVectorType>(II.getType());
+  auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
+                                             : II.getOperand(0)->getType());
   unsigned VWidth = IIVTy->getNumElements();
   if (VWidth == 1)
     return nullptr;
@@ -1172,29 +1318,54 @@
       (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
   OverloadTys[0] = NewTy;
 
+  if (!IsLoad) {
+    SmallVector<int, 8> EltMask;
+    for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx) {
+      if (DMaskIdx < 0) {
+        // Buffer case.
+        if (DemandedElts[OrigStoreIdx])
+          EltMask.push_back(OrigStoreIdx);
+      } else {
+        // Image case.
+        ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
+        unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+        if (DMaskVal & (1 << OrigStoreIdx))
+          EltMask.push_back(OrigStoreIdx);
+      }
+    }
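+    // EltMask now holds the indices of the lanes that are actually stored,
+    // e.g. {0, 1} for DemandedElts == 0b0011; the extract or shuffle below
+    // packs exactly those lanes into the narrower store data.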
+
+    if (NewNumElts == 1)
+      Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
+    else
+      Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
+  }
+
   Function *NewIntrin = Intrinsic::getDeclaration(
       II.getModule(), II.getIntrinsicID(), OverloadTys);
   CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
 
-  if (NewNumElts == 1) {
-    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
-                                          DemandedElts.countr_zero());
-  }
+  if (IsLoad) {
+    if (NewNumElts == 1)
+      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+                                            DemandedElts.countr_zero());
+
+    SmallVector<int, 8> EltMask;
+    unsigned NewLoadIdx = 0;
+    for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+      if (!!DemandedElts[OrigLoadIdx])
+        EltMask.push_back(NewLoadIdx++);
+      else
+        EltMask.push_back(NewNumElts);
+    }
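+    // Demanded lanes are renumbered densely; undemanded lanes select index
+    // NewNumElts, i.e. the poison second shuffle operand. For example,
+    // DemandedElts == 0b0011 with VWidth == 4 gives EltMask == {0, 1, 2, 2}.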
-  SmallVector<int, 8> EltMask;
-  unsigned NewLoadIdx = 0;
-  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
-    if (!!DemandedElts[OrigLoadIdx])
-      EltMask.push_back(NewLoadIdx++);
-    else
-      EltMask.push_back(NewNumElts);
-  }
+    auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
 
-  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+    return Shuffle;
+  }
 
-  return Shuffle;
+  return NewCall;
 }
 
 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
===================================================================
--- llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll
@@ -66,7 +66,7 @@
 
 define float @test_constant_fold_rcp_f32_43_strictfp() nounwind strictfp {
 ; CHECK-LABEL: @test_constant_fold_rcp_f32_43_strictfp(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR14:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) #[[ATTR12:[0-9]+]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.rcp.f32(float 4.300000e+01) strictfp nounwind readnone
@@ -107,7 +107,7 @@
 
 define half @test_constant_fold_sqrt_f16_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f16_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR15:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH0000) #[[ATTR13:[0-9]+]]
 ; CHECK-NEXT:    ret half [[VAL]]
 ;
   %val = call half @llvm.amdgcn.sqrt.f16(half 0.0) nounwind readnone
@@ -116,7 +116,7 @@
 
 define float @test_constant_fold_sqrt_f32_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f32_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float 0.000000e+00) #[[ATTR13]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.sqrt.f32(float 0.0) nounwind readnone
@@ -125,7 +125,7 @@
 
 define double @test_constant_fold_sqrt_f64_0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f64_0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double 0.000000e+00) #[[ATTR13]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.sqrt.f64(double 0.0) nounwind readnone
@@ -134,7 +134,7 @@
 
 define half @test_constant_fold_sqrt_f16_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f16_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call half @llvm.amdgcn.sqrt.f16(half 0xH8000) #[[ATTR13]]
 ; CHECK-NEXT:    ret half [[VAL]]
 ;
   %val = call half @llvm.amdgcn.sqrt.f16(half -0.0) nounwind readnone
@@ -143,7 +143,7 @@
 
 define float @test_constant_fold_sqrt_f32_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f32_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call float @llvm.amdgcn.sqrt.f32(float -0.000000e+00) #[[ATTR13]]
 ; CHECK-NEXT:    ret float [[VAL]]
 ;
   %val = call float @llvm.amdgcn.sqrt.f32(float -0.0) nounwind readnone
@@ -152,7 +152,7 @@
 
 define double @test_constant_fold_sqrt_f64_neg0() nounwind {
 ; CHECK-LABEL: @test_constant_fold_sqrt_f64_neg0(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR15]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.sqrt.f64(double -0.000000e+00) #[[ATTR13]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.sqrt.f64(double -0.0) nounwind readnone
@@ -644,7 +644,7 @@
 
 define i1 @test_class_isnan_f32_strict(float %x) nounwind {
 ; CHECK-LABEL: @test_class_isnan_f32_strict(
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR16:[0-9]+]]
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 3) #[[ATTR14:[0-9]+]]
 ; CHECK-NEXT:    ret i1 [[VAL]]
 ;
   %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 3) strictfp
@@ -662,7 +662,7 @@
 
 define i1 @test_class_is_p0_n0_f32_strict(float %x) nounwind {
 ; CHECK-LABEL: @test_class_is_p0_n0_f32_strict(
-; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR16]]
+; CHECK-NEXT:    [[VAL:%.*]] = call i1 @llvm.amdgcn.class.f32(float [[X:%.*]], i32 96) #[[ATTR14]]
 ; CHECK-NEXT:    ret i1 [[VAL]]
 ;
   %val = call i1 @llvm.amdgcn.class.f32(float %x, i32 96) strictfp
@@ -1275,8 +1275,8 @@
 
 define i32 @ubfe_offset_0_width_3(i32 %src) {
 ; CHECK-LABEL: @ubfe_offset_0_width_3(
-; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[SRC:%.*]], 7
-; CHECK-NEXT:    ret i32 [[TMP1]]
+; CHECK-NEXT:    [[BFE:%.*]] = and i32 [[SRC:%.*]], 7
+; CHECK-NEXT:    ret i32 [[BFE]]
 ;
   %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3)
   ret i32 %bfe
@@ -1793,7 +1793,7 @@
 
 define i64 @icmp_constant_inputs_true() {
 ; CHECK-LABEL: @icmp_constant_inputs_true(
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR17:[0-9]+]]
+; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0:![0-9]+]]) #[[ATTR15:[0-9]+]]
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %result = call i64 @llvm.amdgcn.icmp.i64.i32(i32 9, i32 8, i32 34)
@@ -2500,7 +2500,7 @@
 
 define i64 @fcmp_constant_inputs_true() {
 ; CHECK-LABEL: @fcmp_constant_inputs_true(
-; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]]
+; CHECK-NEXT:    [[RESULT:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR15]]
 ; CHECK-NEXT:    ret i64 [[RESULT]]
 ;
   %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float 2.0, float 4.0, i32 4)
@@ -2542,7 +2542,7 @@
 
 define i64 @ballot_one_64() {
 ; CHECK-LABEL: @ballot_one_64(
-; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR17]]
+; CHECK-NEXT:    [[B:%.*]] = call i64 @llvm.read_register.i64(metadata [[META0]]) #[[ATTR15]]
 ; CHECK-NEXT:    ret i64 [[B]]
 ;
   %b = call i64 @llvm.amdgcn.ballot.i64(i1 1)
@@ -2568,7 +2568,7 @@
 
 define i32 @ballot_one_32() {
 ; CHECK-LABEL: @ballot_one_32(
-; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR17]]
+; CHECK-NEXT:    [[B:%.*]] = call i32 @llvm.read_register.i32(metadata [[META1:![0-9]+]]) #[[ATTR15]]
 ; CHECK-NEXT:    ret i32 [[B]]
 ;
   %b = call i32 @llvm.amdgcn.ballot.i32(i1 1)
@@ -4551,7 +4551,7 @@
 
 define amdgpu_kernel void @store_mip_1d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) {
 ; CHECK-LABEL: @store_mip_1d(
 ; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 0, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 main_body:
@@ -4562,7 +4562,7 @@
 
 define amdgpu_kernel void @store_mip_2d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
 ; CHECK-LABEL: @store_mip_2d(
 ; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 0, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 main_body:
@@ -4573,7 +4573,7 @@
 
 define amdgpu_kernel void @store_mip_3d(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) {
 ; CHECK-LABEL: @store_mip_3d(
 ; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], i32 0, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 main_body:
@@ -4584,7 +4584,7 @@
 
 define amdgpu_kernel void @store_mip_1darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) {
 ; CHECK-LABEL: @store_mip_1darray(
 ; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 0, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 main_body:
@@ -4595,7 +4595,7 @@
 
 define amdgpu_kernel void @store_mip_2darray(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) {
 ; CHECK-LABEL: @store_mip_2darray(
 ; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], i32 0, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 main_body:
@@ -4606,7 +4606,7 @@
 
 define amdgpu_kernel void @store_mip_cube(ptr addrspace(1) %out, <8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %u) {
 ; CHECK-LABEL: @store_mip_cube(
 ; CHECK-NEXT:  main_body:
-; CHECK-NEXT:    call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; CHECK-NEXT:    call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> [[VDATA:%.*]], i32 15, i32 [[S:%.*]], i32 [[T:%.*]], i32 [[U:%.*]], i32 0, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
 ; CHECK-NEXT:    ret void
 ;
 main_body:
@@ -5586,7 +5586,7 @@
 
 define double @trig_preop_constfold_strictfp() {
 ; CHECK-LABEL: @trig_preop_constfold_strictfp(
-; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR16]]
+; CHECK-NEXT:    [[VAL:%.*]] = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) #[[ATTR14]]
 ; CHECK-NEXT:    ret double [[VAL]]
 ;
   %val = call double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5) strictfp
Index: llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mcpu=gfx1100 -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck -check-prefixes=GFX11 %s
+
+define amdgpu_ps void @image_store_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, i32 %s) #0 {
+; GFX11-LABEL: @image_store_1d_store_insert_zeros_at_end(
+; GFX11-NEXT:    call void @llvm.amdgcn.image.store.1d.f32.i32(float [[VDATA1:%.*]], i32 1, i32 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX11-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %newvdata4, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> inreg %rsrc, float %vdata1, float %vdata2, i32 %s, i32 %mip) #0 {
+; GFX11-LABEL: @image_store_mip_1d_store_insert_zeros_at_end(
+; GFX11-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float 0.000000e+00, float undef, float undef>, float [[VDATA1:%.*]], i64 1
+; GFX11-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA2:%.*]], i64 2
+; GFX11-NEXT:    call void @llvm.amdgcn.image.store.mip.1d.v3f32.i32(<3 x float> [[TMP2]], i32 7, i32 [[S:%.*]], i32 [[MIP:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
+; GFX11-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata2, i32 2
  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %newvdata4, i32 7, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @buffer_store_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GFX11-LABEL: @buffer_store_insert_zeros_at_end(
+; GFX11-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[VDATA1:%.*]], i64 0
+; GFX11-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; GFX11-NEXT:    call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
+; GFX11-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_buffer_store_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GFX11-LABEL: @struct_buffer_store_insert_zeros(
+; GFX11-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> <float undef, float 0.000000e+00, float undef>, float [[VDATA1:%.*]], i64 0
+; GFX11-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[VDATA1]], i64 2
+; GFX11-NEXT:    call void @llvm.amdgcn.struct.buffer.store.v3f32(<3 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0)
+; GFX11-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float %vdata1, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0)
+  ret void
+}
+
+define amdgpu_ps void @struct_tbuffer_store_insert_zeros_at_beginning(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GFX11-LABEL: @struct_tbuffer_store_insert_zeros_at_beginning(
+; GFX11-NEXT:    [[NEWVDATA4:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef>, float [[VDATA1:%.*]], i64 3
+; GFX11-NEXT:    call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 42, i32 0, i32 15)
+; GFX11-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float 0.0, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float 0.0, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float %vdata1, i32 3
+  call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 42, i32 0, i32 15)
+  ret void
+}
+
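+; An extra illustrative case; the CHECK lines here are hand-written from the
+; transform's expected behavior rather than autogenerated: a raw buffer store
+; with two trailing zero lanes should shrink to a <2 x float> store, mirroring
+; @buffer_store_insert_zeros_at_end above.
+define amdgpu_ps void @raw_buffer_store_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
+; GFX11-LABEL: @raw_buffer_store_insert_zeros_at_end(
+; GFX11-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float [[VDATA1:%.*]], i64 0
+; GFX11-NEXT:    [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
+; GFX11-NEXT:    call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i32 0)
+; GFX11-NEXT:    ret void
+;
+  %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
+  %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
+  %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
+  %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
+  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i32 0)
+  ret void
+}
+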
+declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
+declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
+declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
+declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
+declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+declare void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float>, i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind }