Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -277,6 +277,10 @@
   int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                      VectorType *SubTp);
+
+  TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L,
+                                                     ScalarEvolution *SE) const;
+
   /// @}
 };
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,6 +9,7 @@
 #include "AArch64TargetTransformInfo.h"
 #include "AArch64ExpandImm.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
@@ -1278,3 +1279,16 @@
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
+
+TTI::AddressingModeKind
+AArch64TTIImpl::getPreferredAddressingMode(const Loop *L,
+                                           ScalarEvolution *SE) const {
+  // Pre-indexed addressing modes will generally introduce base address
+  // modifying instruction(s) into the preheader and are only really useful
+  // for unrolled loops; we don't generally unroll when optimising for size.
+  if (L->getHeader()->getParent()->hasOptSize() ||
+      L->getNumBlocks() != 1)
+    return TTI::AMK_None;
+
+  return TTI::AMK_PreIndexed;
+}
Index: llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -133,20 +133,21 @@
 define void @matrix_mul_double_shuffle(i32 %N, i32* nocapture %C, i16* nocapture readonly %A, i16 %val) {
 ; CHECK-LABEL: matrix_mul_double_shuffle:
 ; CHECK: // %bb.0: // %vector.header
-; CHECK-NEXT: and w9, w3, #0xffff
+; CHECK-NEXT: and w10, w3, #0xffff
 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
 ; CHECK-NEXT: and x8, x0, #0xfffffff8
-; CHECK-NEXT: dup v0.4h, w9
+; CHECK-NEXT: sub x9, x2, #16 // =16
+; CHECK-NEXT: dup v0.4h, w10
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldrh w9, [x2], #16
-; CHECK-NEXT: mov w10, w0
+; CHECK-NEXT: ldrh w10, [x9, #16]!
+; CHECK-NEXT: mov w11, w0
 ; CHECK-NEXT: subs x8, x8, #8 // =8
-; CHECK-NEXT: lsl x10, x10, #2
-; CHECK-NEXT: dup v1.4h, w9
+; CHECK-NEXT: lsl x11, x11, #2
+; CHECK-NEXT: dup v1.4h, w10
 ; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
 ; CHECK-NEXT: add w0, w0, #8 // =8
-; CHECK-NEXT: str q1, [x1, x10]
+; CHECK-NEXT: str q1, [x1, x11]
 ; CHECK-NEXT: b.ne .LBB2_1
 ; CHECK-NEXT: // %bb.2: // %for.end12
 ; CHECK-NEXT: ret
Index: llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
+++ llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
@@ -12,7 +12,7 @@
 br label %for.body
 for.body: ; preds = %for.body, %entry
-; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
+; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ -8, %entry ]
 ; Only one induction variable should have been generated.
 ; CHECK-NOT: phi
 %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
@@ -20,7 +20,7 @@
 %arrayidx = getelementptr inbounds double, double* %b, i64 %tmp
 %tmp1 = load double, double* %arrayidx, align 8
 ; The induction variable should carry the scaling factor: 1 * 8 = 8.
-; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 8
+; CHECK: [[IVNEXT]] = add nsw i64 [[IV]], 8
 %indvars.iv.next = add i64 %indvars.iv, 1
 %arrayidx2 = getelementptr inbounds double, double* %c, i64 %indvars.iv.next
 %tmp2 = load double, double* %arrayidx2, align 8
@@ -28,8 +28,8 @@
 %arrayidx4 = getelementptr inbounds double, double* %a, i64 %indvars.iv
 store double %mul, double* %arrayidx4, align 8
 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 8 = 152.
-; CHECK: icmp eq i32 {{%[^,]+}}, 152
+; Comparison should be 19 * 8 - 8 = 144.
+; CHECK: icmp eq i32 {{%[^,]+}}, 144
 %exitcond = icmp eq i32 %lftr.wideiv, 20
 br i1 %exitcond, label %for.end, label %for.body
Index: llvm/test/CodeGen/AArch64/falkor-hwpf-fix.ll
===================================================================
--- llvm/test/CodeGen/AArch64/falkor-hwpf-fix.ll
+++ llvm/test/CodeGen/AArch64/falkor-hwpf-fix.ll
@@ -3,9 +3,9 @@
 ; Check that strided load tag collisions are avoided on Falkor.
 ; CHECK-LABEL: hwpf1:
-; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE:[0-9]+]], #-16]
+; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE:[0-9]+]]]
 ; CHECK: mov x[[BASE2:[0-9]+]], x[[BASE]]
-; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE2]], #-8]
+; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE2]], #8]
 ; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE3:[0-9]+]]]
 ; CHECK: mov x[[BASE4:[0-9]+]], x[[BASE3]]
 ; CHECK: ldp {{w[0-9]+}}, {{w[0-9]+}}, [x[[BASE4]], #8]
Index: llvm/test/CodeGen/AArch64/pr27816.ll
===================================================================
--- llvm/test/CodeGen/AArch64/pr27816.ll
+++ llvm/test/CodeGen/AArch64/pr27816.ll
@@ -7,9 +7,9 @@
 ; CHECK-LABEL: @merge_const_store
 ; CHECK-NOT: strb
-; CHECK: str x8, [x1]
+; CHECK: str x9, [x8, #12]!
 ; CHECK-NOT: strb
-; CHECK: str wzr, [x1, #8]
+; CHECK: str wzr, [x8, #8]
 ; CHECK-NOT: strb
 define void @merge_const_store(i32 %count, %struct.A* nocapture %p) {
 %1 = icmp sgt i32 %count, 0
Index: llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,12 +8,11 @@
 define dso_local void @run_test() local_unnamed_addr #0 {
 ; CHECK-LABEL: run_test:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sub sp, sp, #80 // =80
-; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 80
+; CHECK-NEXT: stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
 ; CHECK-NEXT: .cfi_offset b8, -8
 ; CHECK-NEXT: .cfi_offset b9, -16
 ; CHECK-NEXT: .cfi_offset b10, -24
@@ -22,14 +21,14 @@
 ; CHECK-NEXT: .cfi_offset b13, -48
 ; CHECK-NEXT: .cfi_offset b14, -56
 ; CHECK-NEXT: .cfi_offset b15, -64
-; CHECK-NEXT: adrp x10, B+48
-; CHECK-NEXT: adrp x11, A
+; CHECK-NEXT: adrp x10, B
+; CHECK-NEXT: add x10, x10, :lo12:B
+; CHECK-NEXT: adrp x9, A+120
 ; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: mov x9, xzr
 ; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: add x10, x10, :lo12:B+48
-; CHECK-NEXT: add x11, x11, :lo12:A
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: add x9, x9, :lo12:A+120
+; CHECK-NEXT: sub x10, x10, #16 // =16
+; CHECK-NEXT: mov w11, #8
 ; CHECK-NEXT: // implicit-def: $q1
 ; CHECK-NEXT: // implicit-def: $q2
 ; CHECK-NEXT: // implicit-def: $q3
@@ -61,103 +60,96 @@
 ; CHECK-NEXT: // implicit-def: $q13
 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: mov x12, xzr
+; CHECK-NEXT: mov x13, xzr
+; CHECK-NEXT: ldr q14, [x13]
 ; CHECK-NEXT: ldr q15, [x8]
-; CHECK-NEXT: ldr q14, [x12]
-; CHECK-NEXT: ldr q0, [x10], #64
-; CHECK-NEXT: ldr x18, [x12]
-; CHECK-NEXT: fmov x15, d15
-; CHECK-NEXT: mov x14, v15.d[1]
-; CHECK-NEXT: fmov x13, d14
-; CHECK-NEXT: mul x1, x15, x18
-; CHECK-NEXT: mov x16, v0.d[1]
-; CHECK-NEXT: fmov x17, d0
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: mul x1, x14, x18
+; CHECK-NEXT: subs x11, x11, #1 // =1
 ; CHECK-NEXT: mov x12, v14.d[1]
-; CHECK-NEXT: ldr x0, [x8]
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: mul x1, x13, x18
-; CHECK-NEXT: add v12.2d, v12.2d, v0.2d
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: mul x1, x12, x18
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: mul x1, x17, x18
-; CHECK-NEXT: add v13.2d, v13.2d, v0.2d
-; CHECK-NEXT: add v11.2d, v11.2d, v0.2d
-; CHECK-NEXT: fmov d0, x1
-; CHECK-NEXT: mul x18, x16, x18
-; CHECK-NEXT: ldr q14, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: mov v0.d[1], x18
-; CHECK-NEXT: mul x18, x15, x0
-; CHECK-NEXT: add x1, x11, x8
-; CHECK-NEXT: add v10.2d, v10.2d, v0.2d
-; CHECK-NEXT: fmov d0, x18
-; CHECK-NEXT: mul x18, x14, x0
-; CHECK-NEXT: ldr x1, [x1, #128]
-; CHECK-NEXT: mov v0.d[1], x18
-; CHECK-NEXT: mul x18, x13, x0
-; CHECK-NEXT: add v8.2d, v8.2d, v0.2d
-; CHECK-NEXT: add v25.2d, v25.2d, v0.2d
-; CHECK-NEXT: add v22.2d, v22.2d, v0.2d
-; CHECK-NEXT: add v18.2d, v18.2d, v0.2d
-; CHECK-NEXT: add v6.2d, v6.2d, v0.2d
-; CHECK-NEXT: add v14.2d, v14.2d, v0.2d
-; CHECK-NEXT: fmov d0, x18
-; CHECK-NEXT: mul x18, x12, x0
-; CHECK-NEXT: mov v0.d[1], x18
-; CHECK-NEXT: mul x18, x17, x0
-; CHECK-NEXT: mul x0, x16, x0
-; CHECK-NEXT: add v9.2d, v9.2d, v0.2d
-; CHECK-NEXT: add v31.2d, v31.2d, v0.2d
-; CHECK-NEXT: add v26.2d, v26.2d, v0.2d
-; CHECK-NEXT: add v23.2d, v23.2d, v0.2d
-; CHECK-NEXT: add v21.2d, v21.2d, v0.2d
-; CHECK-NEXT: add v19.2d, v19.2d, v0.2d
-; CHECK-NEXT: add v17.2d, v17.2d, v0.2d
-; CHECK-NEXT: add v7.2d, v7.2d, v0.2d
-; CHECK-NEXT: add v5.2d, v5.2d, v0.2d
-; CHECK-NEXT: add v3.2d, v3.2d, v0.2d
-; CHECK-NEXT: add v2.2d, v2.2d, v0.2d
-; CHECK-NEXT: fmov d0, x18
-; CHECK-NEXT: mul x15, x15, x1
-; CHECK-NEXT: mov v0.d[1], x0
-; CHECK-NEXT: mul x14, x14, x1
-; CHECK-NEXT: add v30.2d, v30.2d, v0.2d
-; CHECK-NEXT: add v24.2d, v24.2d, v0.2d
-; CHECK-NEXT: add v20.2d, v20.2d, v0.2d
-; CHECK-NEXT: add v16.2d, v16.2d, v0.2d
-; CHECK-NEXT: add v4.2d, v4.2d, v0.2d
-; CHECK-NEXT: add v1.2d, v1.2d, v0.2d
-; CHECK-NEXT: fmov d0, x15
-; CHECK-NEXT: mul x13, x13, x1
-; CHECK-NEXT: mov v0.d[1], x14
-; CHECK-NEXT: mul x12, x12, x1
-; CHECK-NEXT: add v29.2d, v29.2d, v0.2d
-; CHECK-NEXT: fmov d0, x13
-; CHECK-NEXT: mul x17, x17, x1
-; CHECK-NEXT: mov v0.d[1], x12
-; CHECK-NEXT: mul x16, x16, x1
-; CHECK-NEXT: add v28.2d, v28.2d, v0.2d
-; CHECK-NEXT: fmov d0, x17
-; CHECK-NEXT: mov v0.d[1], x16
-; CHECK-NEXT: add x8, x8, #8 // =8
-; CHECK-NEXT: add v27.2d, v27.2d, v0.2d
-; CHECK-NEXT: cmp x8, #64 // =64
-; CHECK-NEXT: add x9, x9, #1 // =1
-; CHECK-NEXT: str q14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: fmov x14, d14
+; CHECK-NEXT: ldr q14, [x10, #64]!
+; CHECK-NEXT: ldr x13, [x13]
+; CHECK-NEXT: fmov x16, d15
+; CHECK-NEXT: mov x15, v15.d[1]
+; CHECK-NEXT: mov x1, v14.d[1]
+; CHECK-NEXT: mul x0, x16, x13
+; CHECK-NEXT: fmov x2, d14
+; CHECK-NEXT: fmov d14, x0
+; CHECK-NEXT: mul x0, x15, x13
+; CHECK-NEXT: ldr x17, [x8], #1
+; CHECK-NEXT: ldr x18, [x9, #8]!
+; CHECK-NEXT: mov v14.d[1], x0
+; CHECK-NEXT: mul x0, x14, x13
+; CHECK-NEXT: add v12.2d, v12.2d, v14.2d
+; CHECK-NEXT: fmov d14, x0
+; CHECK-NEXT: mul x0, x12, x13
+; CHECK-NEXT: mov v14.d[1], x0
+; CHECK-NEXT: mul x0, x2, x13
+; CHECK-NEXT: add v13.2d, v13.2d, v14.2d
+; CHECK-NEXT: add v11.2d, v11.2d, v14.2d
+; CHECK-NEXT: fmov d14, x0
+; CHECK-NEXT: mul x13, x1, x13
+; CHECK-NEXT: mov v14.d[1], x13
+; CHECK-NEXT: mul x13, x16, x18
+; CHECK-NEXT: add v10.2d, v10.2d, v14.2d
+; CHECK-NEXT: fmov d14, x13
+; CHECK-NEXT: mul x13, x15, x18
+; CHECK-NEXT: mov v14.d[1], x13
+; CHECK-NEXT: mul x13, x14, x17
+; CHECK-NEXT: mul x14, x14, x18
+; CHECK-NEXT: add v29.2d, v29.2d, v14.2d
+; CHECK-NEXT: fmov d14, x14
+; CHECK-NEXT: mul x14, x12, x18
+; CHECK-NEXT: mov v14.d[1], x14
+; CHECK-NEXT: mul x0, x2, x18
+; CHECK-NEXT: mul x18, x1, x18
+; CHECK-NEXT: add v28.2d, v28.2d, v14.2d
+; CHECK-NEXT: fmov d14, x0
+; CHECK-NEXT: mul x16, x16, x17
+; CHECK-NEXT: mov v14.d[1], x18
+; CHECK-NEXT: mul x15, x15, x17
+; CHECK-NEXT: add v27.2d, v27.2d, v14.2d
+; CHECK-NEXT: fmov d14, x16
+; CHECK-NEXT: mov v14.d[1], x15
+; CHECK-NEXT: mul x12, x12, x17
+; CHECK-NEXT: add v8.2d, v8.2d, v14.2d
+; CHECK-NEXT: add v25.2d, v25.2d, v14.2d
+; CHECK-NEXT: add v22.2d, v22.2d, v14.2d
+; CHECK-NEXT: add v18.2d, v18.2d, v14.2d
+; CHECK-NEXT: add v6.2d, v6.2d, v14.2d
+; CHECK-NEXT: add v0.2d, v0.2d, v14.2d
+; CHECK-NEXT: fmov d14, x13
+; CHECK-NEXT: mul x14, x2, x17
+; CHECK-NEXT: mov v14.d[1], x12
+; CHECK-NEXT: mul x13, x1, x17
+; CHECK-NEXT: add v9.2d, v9.2d, v14.2d
+; CHECK-NEXT: add v31.2d, v31.2d, v14.2d
+; CHECK-NEXT: add v26.2d, v26.2d, v14.2d
+; CHECK-NEXT: add v23.2d, v23.2d, v14.2d
+; CHECK-NEXT: add v21.2d, v21.2d, v14.2d
+; CHECK-NEXT: add v19.2d, v19.2d, v14.2d
+; CHECK-NEXT: add v17.2d, v17.2d, v14.2d
+; CHECK-NEXT: add v7.2d, v7.2d, v14.2d
+; CHECK-NEXT: add v5.2d, v5.2d, v14.2d
+; CHECK-NEXT: add v3.2d, v3.2d, v14.2d
+; CHECK-NEXT: add v2.2d, v2.2d, v14.2d
+; CHECK-NEXT: fmov d14, x14
+; CHECK-NEXT: mov v14.d[1], x13
+; CHECK-NEXT: add v30.2d, v30.2d, v14.2d
+; CHECK-NEXT: add v24.2d, v24.2d, v14.2d
+; CHECK-NEXT: add v20.2d, v20.2d, v14.2d
+; CHECK-NEXT: add v16.2d, v16.2d, v14.2d
+; CHECK-NEXT: add v4.2d, v4.2d, v14.2d
+; CHECK-NEXT: add v1.2d, v1.2d, v14.2d
 ; CHECK-NEXT: b.ne .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT: adrp x8, C
 ; CHECK-NEXT: add x8, x8, :lo12:C
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT: stp q13, q12, [x8]
 ; CHECK-NEXT: stp q11, q10, [x8, #32]
 ; CHECK-NEXT: stp q9, q8, [x8, #64]
-; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT: stp q31, q30, [x8, #96]
 ; CHECK-NEXT: stp q29, q28, [x8, #144]
 ; CHECK-NEXT: stp q27, q26, [x8, #176]
@@ -171,7 +163,7 @@
 ; CHECK-NEXT: stp q4, q3, [x8, #432]
 ; CHECK-NEXT: stp q0, q2, [x8, #464]
 ; CHECK-NEXT: str q1, [x8, #496]
-; CHECK-NEXT: add sp, sp, #80 // =80
+; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT: ret
 entry:
 br label %for.cond1.preheader
Index: llvm/test/CodeGen/AArch64/vldn_shuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -4,15 +4,17 @@
 define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
 ; CHECK-LABEL: vld2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: sub x8, x1, #16 // =16
+; CHECK-NEXT: sub x9, x0, #32 // =32
+; CHECK-NEXT: mov w10, #1024
 ; CHECK-NEXT: .LBB0_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32
+; CHECK-NEXT: add x9, x9, #32 // =32
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9]
+; CHECK-NEXT: subs x10, x10, #4 // =4
 ; CHECK-NEXT: fmul v2.4s, v0.4s, v0.4s
 ; CHECK-NEXT: fmla v2.4s, v1.4s, v1.4s
-; CHECK-NEXT: str q2, [x1, x8]
-; CHECK-NEXT: add x8, x8, #16 // =16
-; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
+; CHECK-NEXT: str q2, [x8, #16]!
 ; CHECK-NEXT: b.ne .LBB0_1
 ; CHECK-NEXT: // %bb.2: // %while.end
 ; CHECK-NEXT: ret
@@ -44,16 +46,18 @@
 define void @vld3(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
 ; CHECK-LABEL: vld3:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: sub x8, x1, #16 // =16
+; CHECK-NEXT: sub x9, x0, #48 // =48
+; CHECK-NEXT: mov w10, #1024
 ; CHECK-NEXT: .LBB1_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; CHECK-NEXT: add x9, x9, #48 // =48
+; CHECK-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x9]
+; CHECK-NEXT: subs x10, x10, #4 // =4
 ; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s
 ; CHECK-NEXT: fmla v3.4s, v1.4s, v1.4s
 ; CHECK-NEXT: fmla v3.4s, v2.4s, v2.4s
-; CHECK-NEXT: str q3, [x1, x8]
-; CHECK-NEXT: add x8, x8, #16 // =16
-; CHECK-NEXT: cmp x8, #1, lsl #12 // =4096
+; CHECK-NEXT: str q3, [x8, #16]!
 ; CHECK-NEXT: b.ne .LBB1_1
 ; CHECK-NEXT: // %bb.2: // %while.end
 ; CHECK-NEXT: ret
@@ -88,18 +92,20 @@
 define void @vld4(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
 ; CHECK-LABEL: vld4:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: sub x8, x1, #32 // =32
+; CHECK-NEXT: sub x9, x0, #64 // =64
+; CHECK-NEXT: mov w10, #1024
 ; CHECK-NEXT: .LBB2_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
-; CHECK-NEXT: add x9, x1, x8
+; CHECK-NEXT: add x9, x9, #64 // =64
+; CHECK-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x9]
 ; CHECK-NEXT: add x8, x8, #32 // =32
-; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
+; CHECK-NEXT: subs x10, x10, #4 // =4
 ; CHECK-NEXT: fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT: fmla v4.4s, v1.4s, v1.4s
 ; CHECK-NEXT: fmul v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT: fmla v5.4s, v3.4s, v3.4s
-; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x9]
+; CHECK-NEXT: st2 { v4.4s, v5.4s }, [x8]
 ; CHECK-NEXT: b.ne .LBB2_1
 ; CHECK-NEXT: // %bb.2: // %while.end
 ; CHECK-NEXT: ret
@@ -139,17 +145,18 @@
 ; CHECK-LABEL: twosrc:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: sub x9, x2, #16 // =16
 ; CHECK-NEXT: .LBB3_1: // %vector.body
 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add x9, x0, x8
-; CHECK-NEXT: add x10, x1, x8
-; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x9]
-; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x10]
+; CHECK-NEXT: add x10, x0, x8
+; CHECK-NEXT: add x11, x1, x8
+; CHECK-NEXT: ld2 { v0.4s, v1.4s }, [x10]
+; CHECK-NEXT: ld2 { v2.4s, v3.4s }, [x11]
 ; CHECK-NEXT: add x8, x8, #32 // =32
 ; CHECK-NEXT: cmp x8, #2, lsl #12 // =8192
 ; CHECK-NEXT: fmul v4.4s, v2.4s, v0.4s
 ; CHECK-NEXT: fmla v4.4s, v1.4s, v3.4s
-; CHECK-NEXT: str q4, [x2], #16
+; CHECK-NEXT: str q4, [x9, #16]!
 ; CHECK-NEXT: b.ne .LBB3_1
 ; CHECK-NEXT: // %bb.2: // %while.end
 ; CHECK-NEXT: ret
Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-memcpy.ll
===================================================================
--- llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-memcpy.ll
+++ llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-memcpy.ll
@@ -8,8 +8,8 @@
 ; CHECK: testCase
 ; CHECK: %while.body{{$}}
-; CHECK: ldr [[STREG:x[0-9]+]], [{{x[0-9]+}}], #8
-; CHECK-NEXT: str [[STREG]], [{{x[0-9]+}}], #8
+; CHECK: ldr [[STREG:x[0-9]+]], [{{x[0-9]+}}, #8]!
+; CHECK-NEXT: str [[STREG]], [{{x[0-9]+}}, #8]!
 ; CHECK: %while.end
 define i32 @testCase() nounwind ssp {
 entry:
Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll
===================================================================
--- llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll
+++ llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-memset.ll
@@ -10,7 +10,7 @@
 ; CHECK: @memset
 ; CHECK: %while.body18{{$}}
-; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #8
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #8]!
 ; First set the IVREG variable, then use it
 ; CHECK-NEXT: sub [[IVREG:x[0-9]+]],
 ; CHECK: [[IVREG]], #8
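
Most of the test changes above follow the same pattern: post-indexed loads and stores (base register updated after the access) become pre-indexed forms (base register updated before the access), with the base register biased back by one step in the loop preheader (for example the added "sub x9, x2, #16" in aarch64-matrix-umull-smull.ll). A rough illustration of the two AArch64 writeback addressing modes, as a generic sketch rather than output of this patch (register names and offset are arbitrary):

  ldr x1, [x0], #8    // post-indexed: load from [x0], then x0 = x0 + 8
  ldr x1, [x0, #8]!   // pre-indexed:  x0 = x0 + 8 first, then load from the updated [x0]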