Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13213,6 +13213,10 @@
     return false;
   }
 
+  // ldp/stp don't support scale
+  if (Ty->isSized() && DL.getTypeSizeInBits(Ty) > 64)
+    return false;
+
   // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
   return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
 }
Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/issue53877.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopStrengthReduce/AArch64/issue53877.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+; Function Attrs: mustprogress nofree nosync nounwind uwtable
+define dso_local void @convolution(ptr %src0, ptr %src1, i64 %stride_xm, i64 %stride_xp, ptr %dst, i32 %w) {
+; CHECK-LABEL: convolution:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:  .LBB0_1: // %do.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr q0, [x0, x2]
+; CHECK-NEXT:    subs w5, w5, #1
+; CHECK-NEXT:    ldr q1, [x0, x3]
+; CHECK-NEXT:    ldp q2, q3, [x0], #32
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ldr q4, [x1, x2]
+; CHECK-NEXT:    ldr q5, [x1, x3]
+; CHECK-NEXT:    ldp q6, q1, [x1], #32
+; CHECK-NEXT:    fadd v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fadd v3.4s, v4.4s, v5.4s
+; CHECK-NEXT:    fadd v1.4s, v6.4s, v1.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    fadd v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    str q0, [x4], #16
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %do.end
+; CHECK-NEXT:    ret
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %dst.addr.0 = phi ptr [ %dst, %entry ], [ %incdec.ptr, %do.body ]
+  %src1.addr.0 = phi ptr [ %src1, %entry ], [ %incdec.ptr2.i7, %do.body ]
+  %src0.addr.0 = phi ptr [ %src0, %entry ], [ %incdec.ptr2.i, %do.body ]
+  %w.addr.0 = phi i32 [ %w, %entry ], [ %dec, %do.body ]
+  %add.ptr.i = getelementptr inbounds i8, ptr %src0.addr.0, i64 %stride_xm
+  %0 = load <4 x float>, ptr %add.ptr.i, align 16
+  %add.ptr1.i = getelementptr inbounds i8, ptr %src0.addr.0, i64 %stride_xp
+  %1 = load <4 x float>, ptr %add.ptr1.i, align 16
+  %incdec.ptr.i = getelementptr inbounds <4 x float>, ptr %src0.addr.0, i64 1
+  %2 = load <4 x float>, ptr %src0.addr.0, align 16
+  %incdec.ptr2.i = getelementptr inbounds <4 x float>, ptr %src0.addr.0, i64 2
+  %3 = load <4 x float>, ptr %incdec.ptr.i, align 16
+  %add.i = fadd <4 x float> %0, %1
+  %add3.i = fadd <4 x float> %2, %3
+  %add4.i = fadd <4 x float> %add.i, %add3.i
+  %add.ptr.i4 = getelementptr inbounds i8, ptr %src1.addr.0, i64 %stride_xm
+  %4 = load <4 x float>, ptr %add.ptr.i4, align 16
+  %add.ptr1.i5 = getelementptr inbounds i8, ptr %src1.addr.0, i64 %stride_xp
+  %5 = load <4 x float>, ptr %add.ptr1.i5, align 16
+  %incdec.ptr.i6 = getelementptr inbounds <4 x float>, ptr %src1.addr.0, i64 1
+  %6 = load <4 x float>, ptr %src1.addr.0, align 16
+  %incdec.ptr2.i7 = getelementptr inbounds <4 x float>, ptr %src1.addr.0, i64 2
+  %7 = load <4 x float>, ptr %incdec.ptr.i6, align 16
+  %add.i8 = fadd <4 x float> %4, %5
+  %add3.i9 = fadd <4 x float> %6, %7
+  %add4.i10 = fadd <4 x float> %add.i8, %add3.i9
+  %add = fadd <4 x float> %add4.i, %add4.i10
+  %incdec.ptr = getelementptr inbounds <4 x float>, ptr %dst.addr.0, i64 1
+  store <4 x float> %add, ptr %dst.addr.0, align 16
+  %dec = add nsw i32 %w.addr.0, -1
+  %tobool.not = icmp eq i32 %dec, 0
+  br i1 %tobool.not, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  ret void
+}