Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -277,6 +277,10 @@
   int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                      VectorType *SubTp);
+
+  TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L,
+                                                     ScalarEvolution *SE) const;
+
   /// @}
 };
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,6 +9,7 @@
 #include "AArch64TargetTransformInfo.h"
 #include "AArch64ExpandImm.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
@@ -1278,3 +1279,47 @@
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
+
+TTI::AddressingModeKind
+AArch64TTIImpl::getPreferredAddressingMode(const Loop *L,
+                                           ScalarEvolution *SE) const {
+  // Pre-indexed addressing modes will generally introduce base address
+  // modifying instruction(s) into the preheader and are only really useful for
+  // unrolled loops, which we don't generally do when optimising for size.
+  if (L->getHeader()->getParent()->hasOptSize() ||
+      L->getNumBlocks() != 1)
+    return TTI::AMK_None;
+
+  // Find unrolled loops and pointers with multiple uses within the loop.
+  DenseMap<Value *, unsigned> NumPointerUses;
+  for (auto &I : *L->getHeader()) {
+    if (I.getType()->isPointerTy())
+      NumPointerUses[&I] = 0;
+
+    for (auto &Use : I.operands()) {
+      if (!Use->getType()->isPointerTy())
+        continue;
+      if (NumPointerUses.count(Use))
+        NumPointerUses[Use]++;
+      else
+        NumPointerUses[Use] = 0;
+    }
+  }
+
+  bool FavorPreIndexed =
+      std::any_of(NumPointerUses.begin(), NumPointerUses.end(),
+                  [](detail::DenseMapPair<Value *, unsigned> Pair) {
+                    return Pair.second > 1;
+                  });
+
+  // TODO: With runtime loop unrolling enabled, pre-indexed addressing modes
+  // give better results and we would like to return AMK_PreIndexed here, but a
+  // missed opportunity in the load/store optimiser results in fewer LDPs/STPs,
+  // not giving the desired overall results, so return AMK_None for now.
+  if (FavorPreIndexed)
+    return TTI::AMK_None;
+
+  // With runtime loop unrolling disabled, post-indexed addressing modes give
+  // better results.
+  return TTI::AMK_PostIndexed;
+}
Index: llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
===================================================================
--- llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
+++ llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
@@ -13,7 +13,7 @@
 ;   x[i] = a[i] + 1;
 ; }
 ;
-; RUN: llc -mtriple aarch64-linux %s -o - | FileCheck %s
+; RUN: llc -mtriple aarch64-linux %s -lsr-preferred-addressing-mode=none -o - | FileCheck %s --check-prefix=CHECK
 
 define dso_local void @f(i32 %n, i32* nocapture %x) {
 entry:
Index: llvm/test/CodeGen/AArch64/vldn_shuffle.ll
===================================================================
--- llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-none-eabif -lsr-preferred-addressing-mode=none | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s --check-prefix=POSTINDEXED
+; RUN: llc < %s -mtriple=aarch64-none-eabif -lsr-preferred-addressing-mode=postindexed | FileCheck %s --check-prefix=POSTINDEXED
 
 define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) {
 ; CHECK-LABEL: vld2:
@@ -16,6 +18,20 @@
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
+;
+; POSTINDEXED-LABEL: vld2:
+; POSTINDEXED:       // %bb.0: // %entry
+; POSTINDEXED-NEXT:    mov w8, #1024
+; POSTINDEXED-NEXT:  .LBB0_1: // %vector.body
+; POSTINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEXED-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
+; POSTINDEXED-NEXT:    subs x8, x8, #4 // =4
+; POSTINDEXED-NEXT:    fmul v2.4s, v0.4s, v0.4s
+; POSTINDEXED-NEXT:    fmla v2.4s, v1.4s, v1.4s
+; POSTINDEXED-NEXT:    str q2, [x1], #16
+; POSTINDEXED-NEXT:    b.ne .LBB0_1
+; POSTINDEXED-NEXT:  // %bb.2: // %while.end
+; POSTINDEXED-NEXT:    ret
 entry:
   br label %vector.body
@@ -57,6 +73,21 @@
 ; CHECK-NEXT:    b.ne .LBB1_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
+;
+; POSTINDEXED-LABEL: vld3:
+; POSTINDEXED:       // %bb.0: // %entry
+; POSTINDEXED-NEXT:    mov w8, #1024
+; POSTINDEXED-NEXT:  .LBB1_1: // %vector.body
+; POSTINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEXED-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48
+; POSTINDEXED-NEXT:    subs x8, x8, #4 // =4
+; POSTINDEXED-NEXT:    fmul v3.4s, v0.4s, v0.4s
+; POSTINDEXED-NEXT:    fmla v3.4s, v1.4s, v1.4s
+; POSTINDEXED-NEXT:    fmla v3.4s, v2.4s, v2.4s
+; POSTINDEXED-NEXT:    str q3, [x1], #16
+; POSTINDEXED-NEXT:    b.ne .LBB1_1
+; POSTINDEXED-NEXT:  // %bb.2: // %while.end
+; POSTINDEXED-NEXT:    ret
 entry:
   br label %vector.body
@@ -103,6 +134,22 @@
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
+;
+; POSTINDEXED-LABEL: vld4:
+; POSTINDEXED:       // %bb.0: // %entry
+; POSTINDEXED-NEXT:    mov w8, #1024
+; POSTINDEXED-NEXT:  .LBB2_1: // %vector.body
+; POSTINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEXED-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
+; POSTINDEXED-NEXT:    subs x8, x8, #4 // =4
+; POSTINDEXED-NEXT:    fmul v4.4s, v0.4s, v0.4s
+; POSTINDEXED-NEXT:    fmla v4.4s, v1.4s, v1.4s
+; POSTINDEXED-NEXT:    fmul v5.4s, v2.4s, v2.4s
+; POSTINDEXED-NEXT:    fmla v5.4s, v3.4s, v3.4s
+; POSTINDEXED-NEXT:    st2 { v4.4s, v5.4s }, [x1], #32
+; POSTINDEXED-NEXT:    b.ne .LBB2_1
+; POSTINDEXED-NEXT:  // %bb.2: // %while.end
+; POSTINDEXED-NEXT:    ret
 entry:
   br label %vector.body
@@ -153,6 +200,21 @@
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  // %bb.2: // %while.end
 ; CHECK-NEXT:    ret
+;
+; POSTINDEXED-LABEL: twosrc:
+; POSTINDEXED:       // %bb.0: // %entry
+; POSTINDEXED-NEXT:    mov w8, #1024
+; POSTINDEXED-NEXT:  .LBB3_1: // %vector.body
+; POSTINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEXED-NEXT:    ld2 { v0.4s, v1.4s }, [x0], #32
+; POSTINDEXED-NEXT:    ld2 { v2.4s, v3.4s }, [x1], #32
+; POSTINDEXED-NEXT:    subs x8, x8, #4 // =4
+; POSTINDEXED-NEXT:    fmul v4.4s, v2.4s, v0.4s
+; POSTINDEXED-NEXT:    fmla v4.4s, v1.4s, v3.4s
+; POSTINDEXED-NEXT:    str q4, [x2], #16
+; POSTINDEXED-NEXT:    b.ne .LBB3_1
+; POSTINDEXED-NEXT:  // %bb.2: // %while.end
+; POSTINDEXED-NEXT:    ret
 entry:
   br label %vector.body
Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
===================================================================
--- llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
+++ llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -lsr-preferred-addressing-mode=postindexed | FileCheck %s --check-prefix=POSTINDEXED
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -lsr-preferred-addressing-mode=preindexed | FileCheck %s --check-prefix=PREINDEXED
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown -lsr-preferred-addressing-mode=none | FileCheck %s --check-prefix=AMKNONE
 
 ; Test LSR for giving small constants, which get re-associated as unfolded
 ; offset, a chance to get combined with loop-invariant registers (same as
@@ -36,6 +39,74 @@
 ; CHECK-NEXT:  .LBB0_5: // %cleanup2
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
+;
+; POSTINDEXED-LABEL: test1:
+; POSTINDEXED:       // %bb.0: // %entry
+; POSTINDEXED-NEXT:    cbz x1, .LBB0_4
+; POSTINDEXED-NEXT:  // %bb.1: // %for.body.preheader
+; POSTINDEXED-NEXT:    add x8, x0, x1, lsl #2
+; POSTINDEXED-NEXT:    add x8, x8, #28 // =28
+; POSTINDEXED-NEXT:  .LBB0_2: // %for.body
+; POSTINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEXED-NEXT:    ldr s1, [x8]
+; POSTINDEXED-NEXT:    fcmp s1, s0
+; POSTINDEXED-NEXT:    b.gt .LBB0_5
+; POSTINDEXED-NEXT:  // %bb.3: // %for.cond
+; POSTINDEXED-NEXT:    // in Loop: Header=BB0_2 Depth=1
+; POSTINDEXED-NEXT:    add x1, x1, #1 // =1
+; POSTINDEXED-NEXT:    add x8, x8, #4 // =4
+; POSTINDEXED-NEXT:    cbnz x1, .LBB0_2
+; POSTINDEXED-NEXT:  .LBB0_4:
+; POSTINDEXED-NEXT:    fmov s0, #-7.00000000
+; POSTINDEXED-NEXT:    ret
+; POSTINDEXED-NEXT:  .LBB0_5: // %cleanup2
+; POSTINDEXED-NEXT:    mov v0.16b, v1.16b
+; POSTINDEXED-NEXT:    ret
+;
+; PREINDEXED-LABEL: test1:
+; PREINDEXED:       // %bb.0: // %entry
+; PREINDEXED-NEXT:    cbz x1, .LBB0_4
+; PREINDEXED-NEXT:  // %bb.1: // %for.body.preheader
+; PREINDEXED-NEXT:    add x9, x0, x1, lsl #2
+; PREINDEXED-NEXT:    neg x8, x1
+; PREINDEXED-NEXT:    add x9, x9, #24 // =24
+; PREINDEXED-NEXT:  .LBB0_2: // %for.body
+; PREINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; PREINDEXED-NEXT:    ldr s1, [x9, #4]
+; PREINDEXED-NEXT:    fcmp s1, s0
+; PREINDEXED-NEXT:    b.gt .LBB0_5
+; PREINDEXED-NEXT:  // %bb.3: // %for.cond
+; PREINDEXED-NEXT:    // in Loop: Header=BB0_2 Depth=1
+; PREINDEXED-NEXT:    subs x8, x8, #1 // =1
+; PREINDEXED-NEXT:    add x9, x9, #4 // =4
+; PREINDEXED-NEXT:    b.ne .LBB0_2
+; PREINDEXED-NEXT:  .LBB0_4:
+; PREINDEXED-NEXT:    fmov s0, #-7.00000000
+; PREINDEXED-NEXT:    ret
+; PREINDEXED-NEXT:  .LBB0_5: // %cleanup2
+; PREINDEXED-NEXT:    mov v0.16b, v1.16b
+; PREINDEXED-NEXT:    ret
+;
+; AMKNONE-LABEL: test1:
+; AMKNONE:       // %bb.0: // %entry
+; AMKNONE-NEXT:    cbz x1, .LBB0_4
+; AMKNONE-NEXT:  // %bb.1: // %for.body.preheader
+; AMKNONE-NEXT:    add x8, x0, #28 // =28
+; AMKNONE-NEXT:  .LBB0_2: // %for.body
+; AMKNONE-NEXT:    // =>This Inner Loop Header: Depth=1
+; AMKNONE-NEXT:    ldr s1, [x8, x1, lsl #2]
+; AMKNONE-NEXT:    fcmp s1, s0
+; AMKNONE-NEXT:    b.gt .LBB0_5
+; AMKNONE-NEXT:  // %bb.3: // %for.cond
+; AMKNONE-NEXT:    // in Loop: Header=BB0_2 Depth=1
+; AMKNONE-NEXT:    add x1, x1, #1 // =1
+; AMKNONE-NEXT:    cbnz x1, .LBB0_2
+; AMKNONE-NEXT:  .LBB0_4:
+; AMKNONE-NEXT:    fmov s0, #-7.00000000
+; AMKNONE-NEXT:    ret
+; AMKNONE-NEXT:  .LBB0_5: // %cleanup2
+; AMKNONE-NEXT:    mov v0.16b, v1.16b
+; AMKNONE-NEXT:    ret
 entry:
   %cmp11 = icmp eq i64 %start, 0
   br i1 %cmp11, label %cleanup2, label %for.body
@@ -83,6 +154,77 @@
 ; CHECK-NEXT:  .LBB1_5: // %cleanup4
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    ret
+;
+; POSTINDEXED-LABEL: test2:
+; POSTINDEXED:       // %bb.0: // %entry
+; POSTINDEXED-NEXT:    cbz x1, .LBB1_4
+; POSTINDEXED-NEXT:  // %bb.1: // %for.body.preheader
+; POSTINDEXED-NEXT:    add x8, x0, x1, lsl #2
+; POSTINDEXED-NEXT:    add x8, x8, #28 // =28
+; POSTINDEXED-NEXT:  .LBB1_2: // %for.body
+; POSTINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; POSTINDEXED-NEXT:    ldr s1, [x8]
+; POSTINDEXED-NEXT:    scvtf s2, x1
+; POSTINDEXED-NEXT:    fadd s2, s2, s0
+; POSTINDEXED-NEXT:    fcmp s1, s2
+; POSTINDEXED-NEXT:    b.gt .LBB1_5
+; POSTINDEXED-NEXT:  // %bb.3: // %for.cond
+; POSTINDEXED-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; POSTINDEXED-NEXT:    add x1, x1, #1 // =1
+; POSTINDEXED-NEXT:    add x8, x8, #4 // =4
+; POSTINDEXED-NEXT:    cbnz x1, .LBB1_2
+; POSTINDEXED-NEXT:  .LBB1_4:
+; POSTINDEXED-NEXT:    fmov s0, #-7.00000000
+; POSTINDEXED-NEXT:    ret
+; POSTINDEXED-NEXT:  .LBB1_5: // %cleanup4
+; POSTINDEXED-NEXT:    mov v0.16b, v1.16b
+; POSTINDEXED-NEXT:    ret
+;
+; PREINDEXED-LABEL: test2:
+; PREINDEXED:       // %bb.0: // %entry
+; PREINDEXED-NEXT:    cbz x1, .LBB1_4
+; PREINDEXED-NEXT:  // %bb.1: // %for.body.preheader
+; PREINDEXED-NEXT:    add x8, x0, #28 // =28
+; PREINDEXED-NEXT:  .LBB1_2: // %for.body
+; PREINDEXED-NEXT:    // =>This Inner Loop Header: Depth=1
+; PREINDEXED-NEXT:    ldr s1, [x8, x1, lsl #2]
+; PREINDEXED-NEXT:    scvtf s2, x1
+; PREINDEXED-NEXT:    fadd s2, s2, s0
+; PREINDEXED-NEXT:    fcmp s1, s2
+; PREINDEXED-NEXT:    b.gt .LBB1_5
+; PREINDEXED-NEXT:  // %bb.3: // %for.cond
+; PREINDEXED-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; PREINDEXED-NEXT:    add x1, x1, #1 // =1
+; PREINDEXED-NEXT:    cbnz x1, .LBB1_2
+; PREINDEXED-NEXT:  .LBB1_4:
+; PREINDEXED-NEXT:    fmov s0, #-7.00000000
+; PREINDEXED-NEXT:    ret
+; PREINDEXED-NEXT:  .LBB1_5: // %cleanup4
+; PREINDEXED-NEXT:    mov v0.16b, v1.16b
+; PREINDEXED-NEXT:    ret
+;
+; AMKNONE-LABEL: test2:
+; AMKNONE:       // %bb.0: // %entry
+; AMKNONE-NEXT:    cbz x1, .LBB1_4
+; AMKNONE-NEXT:  // %bb.1: // %for.body.preheader
+; AMKNONE-NEXT:    add x8, x0, #28 // =28
+; AMKNONE-NEXT:  .LBB1_2: // %for.body
+; AMKNONE-NEXT:    // =>This Inner Loop Header: Depth=1
+; AMKNONE-NEXT:    ldr s1, [x8, x1, lsl #2]
+; AMKNONE-NEXT:    scvtf s2, x1
+; AMKNONE-NEXT:    fadd s2, s2, s0
+; AMKNONE-NEXT:    fcmp s1, s2
+; AMKNONE-NEXT:    b.gt .LBB1_5
+; AMKNONE-NEXT:  // %bb.3: // %for.cond
+; AMKNONE-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; AMKNONE-NEXT:    add x1, x1, #1 // =1
+; AMKNONE-NEXT:    cbnz x1, .LBB1_2
+; AMKNONE-NEXT:  .LBB1_4:
+; AMKNONE-NEXT:    fmov s0, #-7.00000000
+; AMKNONE-NEXT:    ret
+; AMKNONE-NEXT:  .LBB1_5: // %cleanup4
+; AMKNONE-NEXT:    mov v0.16b, v1.16b
+; AMKNONE-NEXT:    ret
 entry:
   %cmp14 = icmp eq i64 %start, 0
   br i1 %cmp14, label %cleanup4, label %for.body
Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll
===================================================================
--- llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll
+++ llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll
@@ -1,15 +1,78 @@
-; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-reduce < %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -S -loop-reduce -lsr-preferred-addressing-mode=none < %s | FileCheck %s --check-prefix=CHECK
+; RUN: opt -S -loop-reduce -lsr-preferred-addressing-mode=postindexed < %s | FileCheck %s --check-prefix=POST
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux-gnu"
 
 ; This test check SCEVExpander FactorOutConstant() is not crashing with blind cast 'Factor' to SCEVConstant.
-; CHECK-LABEL: test
 
 ; FIXME: Handle VectorType in SCEVExpander::expandAddToGEP.
 ; The generated IR is not ideal with base 'scalar_vector' cast to i8*, and do ugly getelementptr over casted base.
-; CHECK: uglygep
+
 define void @test(i32* %a, i32 %v, i64 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SCALAR_VECTOR:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    [[SCALAR_VECTOR1:%.*]] = bitcast <vscale x 4 x i32>* [[SCALAR_VECTOR]] to i8*
+; CHECK-NEXT:    [[NUM_ELM:%.*]] = call i64 @llvm.aarch64.sve.cntw(i32 31)
+; CHECK-NEXT:    [[SCALAR_COUNT:%.*]] = and i64 [[NUM_ELM]], -4
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_LOOP:%.*]] ]
+; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT:    br label [[SCALAR_LOOP:%.*]]
+; CHECK:       scalar_loop:
+; CHECK-NEXT:    [[SCALAR_IV:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[SCALAR_IV_NEXT:%.*]], [[SCALAR_LOOP]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i64 [[SCALAR_IV]], 2
+; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCALAR_VECTOR1]], i64 [[TMP0]]
+; CHECK-NEXT:    [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to i32*
+; CHECK-NEXT:    store i32 [[V:%.*]], i32* [[UGLYGEP2]], align 4
+; CHECK-NEXT:    [[SCALAR_IV_NEXT]] = add i64 [[SCALAR_IV]], 1
+; CHECK-NEXT:    [[SCALAR_EXIT:%.*]] = icmp eq i64 [[SCALAR_COUNT]], [[SCALAR_IV_NEXT]]
+; CHECK-NEXT:    br i1 [[SCALAR_EXIT]], label [[FOR_LOOP]], label [[SCALAR_LOOP]]
+; CHECK:       for_loop:
+; CHECK-NEXT:    [[VECTOR:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[SCALAR_VECTOR]], align 16
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i32, i32* [[GEP_A_0]], i64 [[INDVAR]]
+; CHECK-NEXT:    [[VECTOR_PTR:%.*]] = bitcast i32* [[GEP_A]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[VECTOR]], <vscale x 4 x i32>* [[VECTOR_PTR]], i32 4, <vscale x 4 x i1> undef)
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALAR_COUNT]]
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+;
+; POST-LABEL: @test(
+; POST-NEXT:  entry:
+; POST-NEXT:    [[SCALAR_VECTOR:%.*]] = alloca <vscale x 4 x i32>, align 16
+; POST-NEXT:    [[NUM_ELM:%.*]] = call i64 @llvm.aarch64.sve.cntw(i32 31)
+; POST-NEXT:    [[SCALAR_COUNT:%.*]] = and i64 [[NUM_ELM]], -4
+; POST-NEXT:    br label [[LOOP_HEADER:%.*]]
+; POST:       exit:
+; POST-NEXT:    ret void
+; POST:       loop_header:
+; POST-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_LOOP:%.*]] ]
+; POST-NEXT:    [[GEP_A_0:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 0
+; POST-NEXT:    [[GEP_VEC_0:%.*]] = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* [[SCALAR_VECTOR]], i64 0, i64 0
+; POST-NEXT:    br label [[SCALAR_LOOP:%.*]]
+; POST:       scalar_loop:
+; POST-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[SCALAR_LOOP]] ], [ [[SCALAR_COUNT]], [[LOOP_HEADER]] ]
+; POST-NEXT:    [[GEP_VEC:%.*]] = phi i32* [ [[GEP_VEC_0]], [[LOOP_HEADER]] ], [ [[GEP_VEC_INC:%.*]], [[SCALAR_LOOP]] ]
+; POST-NEXT:    store i32 [[V:%.*]], i32* [[GEP_VEC]], align 4
+; POST-NEXT:    [[GEP_VEC_INC]] = getelementptr i32, i32* [[GEP_VEC]], i64 1
+; POST-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; POST-NEXT:    [[SCALAR_EXIT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; POST-NEXT:    br i1 [[SCALAR_EXIT]], label [[FOR_LOOP]], label [[SCALAR_LOOP]]
+; POST:       for_loop:
+; POST-NEXT:    [[VECTOR:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[SCALAR_VECTOR]], align 16
+; POST-NEXT:    [[GEP_A:%.*]] = getelementptr i32, i32* [[GEP_A_0]], i64 [[INDVAR]]
+; POST-NEXT:    [[VECTOR_PTR:%.*]] = bitcast i32* [[GEP_A]] to <vscale x 4 x i32>*
+; POST-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[VECTOR]], <vscale x 4 x i32>* [[VECTOR_PTR]], i32 4, <vscale x 4 x i1> undef)
+; POST-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALAR_COUNT]]
+; POST-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[N:%.*]]
+; POST-NEXT:    br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+;
 entry:
   %scalar_vector = alloca <vscale x 4 x i32>, align 16
   %num_elm = call i64 @llvm.aarch64.sve.cntw(i32 31)