diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -852,6 +852,20 @@ return getBooleanContents(Type.isVector(), Type.isFloatingPoint()); } + /// Promote the given target boolean to a target boolean of the given type. + /// A target boolean is an integer value, not necessarily of type i1, the bits + /// of which conform to getBooleanContents. + /// + /// ValVT is the type of values that produced the boolean. + SDValue promoteTargetBoolean(SelectionDAG &DAG, SDValue Bool, + EVT ValVT) const { + SDLoc dl(Bool); + EVT BoolVT = + getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ValVT); + ISD::NodeType ExtendCode = getExtendForContent(getBooleanContents(ValVT)); + return DAG.getNode(ExtendCode, dl, BoolVT, Bool); + } + /// Return target scheduling preference. Sched::Preference getSchedulingPreference() const { return SchedPreferenceInfo; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10044,6 +10044,8 @@ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); SDValue Mask = MST->getMask(); SDValue Chain = MST->getChain(); + SDValue Value = MST->getValue(); + SDValue Ptr = MST->getBasePtr(); SDLoc DL(N); // Zap masked stores with a zero mask. @@ -10063,6 +10065,42 @@ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); + if (MST->isTruncatingStore() && MST->isUnindexed() && + Value.getValueType().isInteger() && + (!isa<ConstantSDNode>(Value) || + !cast<ConstantSDNode>(Value)->isOpaque())) { + APInt TruncDemandedBits = + APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), + MST->getMemoryVT().getScalarSizeInBits()); + + // See if we can simplify the operation with + // SimplifyDemandedBits, which only works if the value has a single use. 
+ if (SimplifyDemandedBits(Value, TruncDemandedBits)) { + // Re-visit the store if anything changed and the store hasn't been merged + // with another node (N is deleted) SimplifyDemandedBits will add Value's + // node back to the worklist if necessary, but we also need to re-visit + // the Store node itself. + if (N->getOpcode() != ISD::DELETED_NODE) + AddToWorklist(N); + return SDValue(N, 0); + } + } + + // If this is a TRUNC followed by a masked store, fold this into a masked + // truncating store. We can do this even if this is already a masked + // truncstore. + if ((Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && + MST->isUnindexed() && + TLI.canCombineTruncStore(Value.getOperand(0).getValueType(), + MST->getMemoryVT(), LegalOperations)) { + auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(), + Value.getOperand(0).getValueType()); + return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, + MST->getOffset(), Mask, MST->getMemoryVT(), + MST->getMemOperand(), MST->getAddressingMode(), + /*IsTruncating=*/true); + } + return SDValue(); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -1007,11 +1007,7 @@ /// /// ValVT is the type of values that produced the boolean. SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) { - SDLoc dl(Bool); - EVT BoolVT = getSetCCResultType(ValVT); - ISD::NodeType ExtendCode = - TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT)); - return DAG.getNode(ExtendCode, dl, BoolVT, Bool); + return TLI.promoteTargetBoolean(DAG, Bool, ValVT); } /// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18897,10 +18897,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE( SDValue Op, SelectionDAG &DAG) const { - auto Store = cast<MaskedStoreSDNode>(Op); - - if (Store->isTruncatingStore()) - return SDValue(); + auto *Store = cast<MaskedStoreSDNode>(Op); SDLoc DL(Op); EVT VT = Store->getValue().getValueType(); diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -155,21 +155,13 @@ define void @masked_store_trunc_v8i64i8(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i8>* %dest) #0 { ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: ptrue p0.b, vl8 -; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b -; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z1.b, #0 -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] -; VBITS_GE_512-NEXT: ret +; VBITS_GE_512-NEXT: ptrue p[[P0:[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d +; VBITS_GE_512-NEXT: st1b { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: ret + %a = load <8 x 
i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -179,21 +171,13 @@ } define void @masked_store_trunc_v8i64i16(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i16>* %dest) #0 { -; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: ptrue p0.h, vl8 -; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z1.h, #0 -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: masked_store_trunc_v8i64i16: +; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d +; VBITS_GE_512-NEXT: st1h { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -203,19 +187,13 @@ } define void @masked_store_trunc_v8i64i32(<8 x i64>* %ap, <8 x i64>* %bp, <8 x i32>* %dest) #0 { -; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d -; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s -; VBITS_GE_512-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: uzp1 z1.s, z1.s, z1.s -; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z1.s, #0 -; 
VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: masked_store_trunc_v8i64i32: +; VBITS_GE_512: ptrue p[[P0:[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].d, p[[P0]]/z, [[Z0]].d, [[Z1]].d +; VBITS_GE_512-NEXT: st1w { [[Z0]].d }, p[[P1]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, <8 x i64>* %ap %b = load <8 x i64>, <8 x i64>* %bp %mask = icmp eq <8 x i64> %a, %b @@ -225,21 +203,13 @@ } define void @masked_store_trunc_v16i32i8(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i8>* %dest) #0 { -; VBITS_GE_512-LABEL: masked_store_trunc_v16i32i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: ptrue p0.b, vl16 -; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b -; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z1.b, #0 -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: masked_store_trunc_v16i32i8: +; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 +; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s +; VBITS_GE_512-NEXT: st1b { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -249,19 +219,13 @@ } define void @masked_store_trunc_v16i32i16(<16 x i32>* %ap, <16 x i32>* %bp, <16 x i16>* %dest) #0 { -; VBITS_GE_512-LABEL: 
masked_store_trunc_v16i32i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s -; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_512-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: uzp1 z1.h, z1.h, z1.h -; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z1.h, #0 -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: masked_store_trunc_v16i32i16: +; VBITS_GE_512: ptrue p[[P0:[0-9]+]].s, vl16 +; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq p[[P1:[0-9]+]].s, p[[P0]]/z, [[Z0]].s, [[Z1]].s +; VBITS_GE_512-NEXT: st1h { [[Z0]].s }, p[[P1]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, <16 x i32>* %ap %b = load <16 x i32>, <16 x i32>* %bp %mask = icmp eq <16 x i32> %a, %b @@ -271,19 +235,13 @@ } define void @masked_store_trunc_v32i16i8(<32 x i16>* %ap, <32 x i16>* %bp, <32 x i8>* %dest) #0 { -; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h -; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_512-NEXT: mov z1.h, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: ptrue p0.b, vl32 -; VBITS_GE_512-NEXT: uzp1 z1.b, z1.b, z1.b -; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z1.b, #0 -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] -; VBITS_GE_512-NEXT: ret +; CHECK-LABEL: masked_store_trunc_v32i16i8: +; VBITS_GE_512: ptrue p[[P0:[0-9]+]].h, vl32 +; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpeq 
p[[P1:[0-9]+]].h, p[[P0]]/z, [[Z0]].h, [[Z1]].h +; VBITS_GE_512-NEXT: st1b { [[Z0]].h }, p[[P1]], [x{{[0-9]+}}] +; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, <32 x i16>* %ap %b = load <32 x i16>, <32 x i16>* %bp %mask = icmp eq <32 x i16> %a, %b