diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14792,8 +14792,8 @@
   return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
 }
 
-static inline int numVectorEltsOrZero(EVT T) {
-  return T.isVector() ? T.getVectorNumElements() : 0;
+static inline ElementCount numVectorEltsOrZero(EVT T) {
+  return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
 }
 
 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
@@ -14861,6 +14861,24 @@
   EVT STMemType = ST->getMemoryVT();
   EVT STType = ST->getValue().getValueType();
 
+  // There are two cases to consider here:
+  //  1. The store is fixed width and the load is scalable. In this case we
+  //     don't know at compile time if the store completely envelops the load
+  //     so we abandon the optimisation.
+  //  2. The store is scalable and the load is fixed width. We could
+  //     potentially support a limited number of cases here, but there has been
+  //     no cost-benefit analysis to prove it's worth it.
+  bool LdStScalable = LDMemType.isScalableVector();
+  if (LdStScalable != STMemType.isScalableVector())
+    return SDValue();
+
+  // If we are dealing with scalable vectors on a big endian platform the
+  // calculation of offsets below becomes trickier, since we do not know at
+  // compile time the absolute size of the vector. Until we've done more
+  // analysis on big-endian platforms it seems better to bail out for now.
+  if (LdStScalable && DAG.getDataLayout().isBigEndian())
+    return SDValue();
+
   BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
   BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
   int64_t Offset;
@@ -14872,13 +14890,21 @@
   // the stored value). With Offset=n (for n > 0) the loaded value starts at the
   // n:th least significant byte of the stored value.
   if (DAG.getDataLayout().isBigEndian())
-    Offset = ((int64_t)STMemType.getStoreSizeInBits() -
-              (int64_t)LDMemType.getStoreSizeInBits()) / 8 - Offset;
+    Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
+              (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
+                 8 -
+             Offset;
 
   // Check that the stored value cover all bits that are loaded.
-  bool STCoversLD =
-      (Offset >= 0) &&
-      (Offset * 8 + LDMemType.getSizeInBits() <= STMemType.getSizeInBits());
+  bool STCoversLD;
+
+  TypeSize LdMemSize = LDMemType.getSizeInBits();
+  TypeSize StMemSize = STMemType.getSizeInBits();
+  if (LdStScalable)
+    STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
+  else
+    STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
+                                       StMemSize.getFixedSize());
 
   auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
     if (LD->isIndexed()) {
@@ -14899,15 +14925,15 @@
   // Memory as copy space (potentially masked).
   if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
     // Simple case: Direct non-truncating forwarding
-    if (LDType.getSizeInBits() == LDMemType.getSizeInBits())
+    if (LDType.getSizeInBits() == LdMemSize)
       return ReplaceLd(LD, ST->getValue(), Chain);
     // Can we model the truncate and extension with an and mask?
     if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
         !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
       // Mask to size of LDMemType
       auto Mask =
-          DAG.getConstant(APInt::getLowBitsSet(STType.getSizeInBits(),
-                                               STMemType.getSizeInBits()),
+          DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
+                                               StMemSize.getFixedSize()),
                           SDLoc(ST), STType);
       auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
       return ReplaceLd(LD, Val, Chain);
diff --git a/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+define <vscale x 2 x i64> @sti64ldi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  %0 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %0
+}
+
+define <vscale x 2 x double> @stf64ldf64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldf64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  %0 = load <vscale x 2 x double>, <vscale x 2 x double>* %arrayidx1
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x i64> @sti32ldi32ext(<vscale x 2 x i32>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti32ldi32ext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtw z1.d, p0/m, z0.d
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0]
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+entry:
+  %0 = trunc <vscale x 2 x i64> %v to <vscale x 2 x i32>
+  store <vscale x 2 x i32> %0, <vscale x 2 x i32>* %P
+  %1 = load <vscale x 2 x i32>, <vscale x 2 x i32>* %P
+  %2 = sext <vscale x 2 x i32> %1 to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %2
+}
+
+define <2 x i64> @sti64ldfixedi64(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldfixedi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ldr q0, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = bitcast <vscale x 2 x i64>* %arrayidx0 to <2 x i64>*
+  %0 = load <2 x i64>, <2 x i64>* %arrayidx1
+  ret <2 x i64> %0
+}
+
+define <vscale x 4 x i32> @sti64ldi32(<vscale x 2 x i64>* nocapture %P, <vscale x 2 x i64> %v) {
+; CHECK-LABEL: sti64ldi32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x i64>* %P to <vscale x 4 x i32>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %P, i64 1
+  store <vscale x 2 x i64> %v, <vscale x 2 x i64>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* %0, i64 1
+  %1 = load <vscale x 4 x i32>, <vscale x 4 x i32>* %arrayidx1
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 2 x i64> @stf64ldi64(<vscale x 2 x double>* nocapture %P, <vscale x 2 x double> %v) {
+; CHECK-LABEL: stf64ldi64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ret
+entry:
+  %0 = bitcast <vscale x 2 x double>* %P to <vscale x 2 x i64>*
+  %arrayidx0 = getelementptr inbounds <vscale x 2 x double>, <vscale x 2 x double>* %P, i64 1
+  store <vscale x 2 x double> %v, <vscale x 2 x double>* %arrayidx0
+  %arrayidx1 = getelementptr inbounds <vscale x 2 x i64>, <vscale x 2 x i64>* %0, i64 1
+  %1 = load <vscale x 2 x i64>, <vscale x 2 x i64>* %arrayidx1
+  ret <vscale x 2 x i64> %1
+}
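
For reference, a minimal standalone sketch of the "store covers load" check that the DAGCombiner change introduces. This is illustrative only and not part of the patch: the helper name storeCoversLoad is hypothetical, and it simply restates the patch's logic on top of LLVM's TypeSize API from llvm/Support/TypeSize.h.

#include "llvm/Support/TypeSize.h"
#include <cstdint>

using namespace llvm;

// Scalable sizes are only known as a multiple of vscale at compile time, so
// anything other than an exact, zero-offset size match is rejected outright;
// for fixed-width types the loaded bits must lie entirely within the stored
// bits.
static bool storeCoversLoad(int64_t Offset, TypeSize LdMemSize,
                            TypeSize StMemSize) {
  // Mixed scalable/fixed sizes cannot be compared at compile time.
  if (LdMemSize.isScalable() != StMemSize.isScalable())
    return false;
  if (LdMemSize.isScalable())
    return Offset == 0 && LdMemSize == StMemSize;
  return Offset >= 0 &&
         Offset * 8 + LdMemSize.getFixedSize() <= StMemSize.getFixedSize();
}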