diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -362,7 +362,6 @@ // * Single constant active lane -> store // * Adjacent vector addresses -> masked.store // * Narrow store width by halfs excluding zero/undef lanes -// * Vector splat address w/known mask -> scalar store // * Vector incrementing address -> vector masked store Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) { auto *ConstMask = dyn_cast(II.getArgOperand(3)); @@ -373,6 +372,36 @@ if (ConstMask->isNullValue()) return eraseInstFromFunction(II); + // Vector splat address -> scalar store + if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) { + // The value is a splat so we need a scalar store from the value to + // destination + if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) { + Align Alignment = cast(II.getArgOperand(2))->getAlignValue(); + StoreInst *S = new StoreInst(SplatValue, SplatPtr, false, Alignment); + S->copyMetadata(II); + return S; + } + // Vector splat address w/known mask -> scalar store + // If only the mask and destination are a splat, then we can extract + // the value from the last lane of the source and do a scalar store + // to destination + if (ConstMask->isAllOnesValue()) { + Align Alignment = cast(II.getArgOperand(2))->getAlignValue(); + VectorType *WideLoadTy = cast(II.getArgOperand(1)->getType()); + ElementCount VF = WideLoadTy->getElementCount(); + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *RunTimeVF = VF.isScalable() ? 
Builder.CreateVScale(EC) : EC; + // LastLane = RunTimeVF - 1 + Value *LastLane = Builder.CreateSub(RunTimeVF, Builder.getInt32(1)); + Value *Extract = + Builder.CreateExtractElement(II.getArgOperand(0), LastLane); + StoreInst *S = new StoreInst(Extract, SplatPtr, false, Alignment); + S->copyMetadata(II); + return S; + } + } if (isa(ConstMask->getType())) return nullptr; diff --git a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll --- a/llvm/test/Transforms/InstCombine/masked_intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/masked_intrinsics.ll @@ -269,3 +269,105 @@ call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %valvec2, <2 x double*> %ptrs, i32 8, <2 x i1> ) ret void } + +;; Value splat but not all active mask +define void @valid_value_inv_store_i16(i16* noalias %dst, i16 %val) #0 { +; CHECK-LABEL: @valid_value_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i16 [[VAL:%.*]], i16* [[DST:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement <4 x i16*> poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector <4 x i16*> %broadcast.splatinsert, <4 x i16*> poison, <4 x i32> zeroinitializer + %broadcast.value = insertelement <4 x i16> poison, i16 %val, i32 0 + %broadcast.splatvalue = shufflevector <4 x i16> %broadcast.value, <4 x i16> poison, <4 x i32> zeroinitializer + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %broadcast.splatvalue, <4 x i16*> %broadcast.splat, i32 2, <4 x i1> shufflevector (<4 x i1> insertelement (<4 x i1> poison, i1 true, i32 1), <4 x i1> poison, <4 x i32> zeroinitializer)) + ret void +} + +;; Mask is not all zero, so should fold the splat value +define void @valid_2value_inv_store_i16(i16* noalias %dst, i16 %val) #0 { +; CHECK-LABEL: @valid_2value_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i16 [[VAL:%.*]], i16* [[DST:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = 
insertelement <4 x i16*> poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector <4 x i16*> %broadcast.splatinsert, <4 x i16*> poison, <4 x i32> zeroinitializer + %broadcast.value = insertelement <4 x i16> poison, i16 %val, i32 0 + %broadcast.splatvalue = shufflevector <4 x i16> %broadcast.value, <4 x i16> poison, <4 x i32> zeroinitializer + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %broadcast.splatvalue, <4 x i16*> %broadcast.splat, i32 2, <4 x i1> shufflevector (<4 x i1> insertelement (<4 x i1> poison, i1 false, i32 1), <4 x i1> poison, <4 x i32> zeroinitializer)) + ret void +} + + +;; Value splat but not all active mask +;; but the mask is all zero +define void @invalid_value_inv_store_i16(i16* noalias %dst, i16 %val) #0 { +; CHECK-LABEL: @invalid_value_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement <4 x i16*> poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector <4 x i16*> %broadcast.splatinsert, <4 x i16*> poison, <4 x i32> zeroinitializer + %broadcast.value = insertelement <4 x i16> poison, i16 %val, i32 0 + %broadcast.splatvalue = shufflevector <4 x i16> %broadcast.value, <4 x i16> poison, <4 x i32> zeroinitializer + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %broadcast.splatvalue, <4 x i16*> %broadcast.splat, i32 2, <4 x i1> shufflevector (<4 x i1> insertelement (<4 x i1> poison, i1 false, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer)) + ret void +} + + +;; All one mask +define void @valid_maks_inv_store_i16(i16* noalias %dst, <4 x i16>* noalias readonly %src) #0 { +; CHECK-LABEL: @valid_maks_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[SRC:%.*]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i64 3 +; CHECK-NEXT: store i16 [[TMP0]], i16* [[DST:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement <4 x i16*> poison, i16* %dst, i32 
0 + %broadcast.splat = shufflevector <4 x i16*> %broadcast.splatinsert, <4 x i16*> poison, <4 x i32> zeroinitializer + %wide.load = load <4 x i16>, <4 x i16>* %src, align 2 + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %wide.load, <4 x i16*> %broadcast.splat, i32 2, <4 x i1> shufflevector (<4 x i1> insertelement (<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer)) + ret void +} + +;; The destination address is not a splat +define void @invalid_addr_inv_store_i16(i16* noalias %dst, <4 x i16>* noalias readonly %src) #0 { +; CHECK-LABEL: @invalid_addr_inv_store_i16( +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[SRC:%.*]], align 2 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> undef, i32 2, <4 x i1> ) +; CHECK-NEXT: ret void +; + %insert.elt = insertelement <4 x i16*> poison, i16* %dst, i32 1 + %broadcast.splat = shufflevector <4 x i16*> %insert.elt, <4 x i16*> poison, <4 x i32> zeroinitializer + %wide.load = load <4 x i16>, <4 x i16>* %src, align 2 + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %wide.load, <4 x i16*> %broadcast.splat, i32 2, <4 x i1> shufflevector (<4 x i1> insertelement (<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer)) + ret void +} + +;; The mask is not all 1 +define void @invalid_mask_inv_store_i16(i16* noalias %dst, <4 x i16>* noalias readonly %src) #0 { +; CHECK-LABEL: @invalid_mask_inv_store_i16( +; CHECK-NEXT: [[INSERT_ELT:%.*]] = insertelement <4 x i16*> poison, i16* [[DST:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16*> [[INSERT_ELT]], <4 x i16*> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[SRC:%.*]], align 2 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> [[WIDE_LOAD]], <4 x i16*> [[BROADCAST_SPLAT]], i32 2, <4 x i1> poison) +; CHECK-NEXT: ret void +; + %insert.elt = insertelement <4 x i16*> 
poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector <4 x i16*> %insert.elt, <4 x i16*> poison, <4 x i32> zeroinitializer + %wide.load = load <4 x i16>, <4 x i16>* %src, align 2 + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %wide.load, <4 x i16*> %broadcast.splat, i32 2, <4 x i1> shufflevector (<4 x i1> insertelement (<4 x i1> poison, i1 true, i32 1), <4 x i1> poison, <4 x i32> zeroinitializer)) + ret void +} + + + +; Function Attrs: nofree nosync nounwind willreturn writeonly +declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32 immarg, <4 x i1>) diff --git a/llvm/test/Transforms/InstCombine/vscale_masked_intrinsics.ll b/llvm/test/Transforms/InstCombine/vscale_masked_intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/vscale_masked_intrinsics.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -instcombine -S < %s | FileCheck %s + +;; Value splat but not all active mask +define void @valid_value_inv_store_i16(i16* noalias %dst, i16 %val) #0 { +; CHECK-LABEL: @valid_value_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i16 [[VAL:%.*]], i16* [[DST:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer + %broadcast.value = insertelement poison, i16 %val, i32 0 + %broadcast.splatvalue = shufflevector %broadcast.value, poison, zeroinitializer + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %broadcast.splatvalue, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 true, i32 1), poison, zeroinitializer)) + ret void +} + +;; Mask is not all zero, so should fold the splat value +define void @valid_2value_inv_store_i16(i16* noalias %dst, i16 %val) #0 { +; CHECK-LABEL: @valid_2value_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i16 [[VAL:%.*]], i16* [[DST:%.*]], 
align 2 +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer + %broadcast.value = insertelement poison, i16 %val, i32 0 + %broadcast.splatvalue = shufflevector %broadcast.value, poison, zeroinitializer + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %broadcast.splatvalue, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 false, i32 1), poison, zeroinitializer)) + ret void +} + + +;; Value splat but the mask is all zero +define void @invalid_value_inv_store_i16(i16* noalias %dst, i16 %val) #0 { +; CHECK-LABEL: @invalid_value_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer + %broadcast.value = insertelement poison, i16 %val, i32 0 + %broadcast.splatvalue = shufflevector %broadcast.value, poison, zeroinitializer + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %broadcast.splatvalue, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 false, i32 0), poison, zeroinitializer)) + ret void +} + + +;; All one mask +define void @valid_maks_inv_store_i16(i16* noalias %dst, * noalias readonly %src) #0 { +; CHECK-LABEL: @valid_maks_inv_store_i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[SRC:%.*]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement [[WIDE_LOAD]], i32 [[TMP2]] +; CHECK-NEXT: store i16 [[TMP3]], i16* [[DST:%.*]], align 2 +; CHECK-NEXT: ret void +; +entry: + %broadcast.splatinsert = insertelement poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer + %wide.load = load , * %src, align 2 + 
call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %wide.load, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + ret void +} + +;; The destination address is not a splat +define void @invalid_addr_inv_store_i16(i16* noalias %dst, * noalias readonly %src) #0 { +; CHECK-LABEL: @invalid_addr_inv_store_i16( +; CHECK-NEXT: [[INSERT_ELT:%.*]] = insertelement poison, i16* [[DST:%.*]], i64 1 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[INSERT_ELT]], poison, zeroinitializer +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[SRC:%.*]], align 2 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) +; CHECK-NEXT: ret void +; + %insert.elt = insertelement poison, i16* %dst, i32 1 + %broadcast.splat = shufflevector %insert.elt, poison, zeroinitializer + %wide.load = load , * %src, align 2 + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %wide.load, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 true, i32 0), poison, zeroinitializer)) + ret void +} + +;; The mask is not all 1 +define void @invalid_mask_inv_store_i16(i16* noalias %dst, * noalias readonly %src) #0 { +; CHECK-LABEL: @invalid_mask_inv_store_i16( +; CHECK-NEXT: [[INSERT_ELT:%.*]] = insertelement poison, i16* [[DST:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[INSERT_ELT]], poison, zeroinitializer +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , * [[SRC:%.*]], align 2 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], i32 2, shufflevector ( insertelement ( poison, i1 true, i32 1), poison, zeroinitializer)) +; CHECK-NEXT: ret void +; + %insert.elt = insertelement poison, i16* %dst, i32 0 + %broadcast.splat = shufflevector %insert.elt, poison, zeroinitializer + %wide.load = load , * %src, align 2 + call void 
@llvm.masked.scatter.nxv4i16.nxv4p0i16( %wide.load, %broadcast.splat, i32 2, shufflevector ( insertelement ( poison, i1 true, i32 1), poison, zeroinitializer)) + ret void +} + + + +; Function Attrs: nofree nosync nounwind willreturn writeonly +declare void @llvm.masked.scatter.nxv4i16.nxv4p0i16(, , i32 immarg, ) + + +attributes #0 = { "target-features"="+sve,+sve" } +