Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -277,6 +277,10 @@ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp); + + TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, + ScalarEvolution *SE) const; + /// @} }; Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -9,6 +9,7 @@ #include "AArch64TargetTransformInfo.h" #include "AArch64ExpandImm.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" @@ -1278,3 +1279,28 @@ return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } + +TTI::AddressingModeKind +AArch64TTIImpl::getPreferredAddressingMode(const Loop *L, + ScalarEvolution *SE) const { + InductionDescriptor IndDesc; + if (!L->getInductionDescriptor(*SE, IndDesc)) + return TTI::AMK_None; + + ConstantInt *CI = IndDesc.getConstIntStepValue(); + if (!CI) + return TTI::AMK_None; + + // With runtime loop unrolling disabled, PostIndexed addressing modes give + // better results. This heuristic checks if the loop induction variable + // update is 16 or less, which means that vectorisation could have been + // applied (with a vectorisation factor of up to 16), but no further runtime + // unrolling, in which case the loop IV update would be bigger than 16. + if (CI->getZExtValue() <= 16) + return TTI::AMK_PostIndexed; + + // TODO: With runtime loop unrolling enabled, PreIndexed addressing modes give + // better results. But we need to enable runtime unrolling first and then + // get some more data on this, so return AMK_None for now. + return TTI::AMK_None; +} Index: llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll =================================================================== --- llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -8,12 +8,11 @@ define dso_local void @run_test() local_unnamed_addr #0 { ; CHECK-LABEL: run_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #80 // =80 -; CHECK-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: stp d15, d14, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: .cfi_offset b8, -8 ; CHECK-NEXT: .cfi_offset b9, -16 ; CHECK-NEXT: .cfi_offset b10, -24 @@ -22,14 +21,13 @@ ; CHECK-NEXT: .cfi_offset b13, -48 ; CHECK-NEXT: .cfi_offset b14, -56 ; CHECK-NEXT: .cfi_offset b15, -64 +; CHECK-NEXT: adrp x9, A+128 ; CHECK-NEXT: adrp x10, B+48 -; CHECK-NEXT: adrp x11, A ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: add x9, x9, :lo12:A+128 ; CHECK-NEXT: add x10, x10, :lo12:B+48 -; CHECK-NEXT: add x11, x11, :lo12:A -; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-NEXT: mov w11, #8 ; CHECK-NEXT: // implicit-def: $q1 ; CHECK-NEXT: // implicit-def: $q2 ; CHECK-NEXT: // implicit-def: $q3 @@ -61,103 +59,96 @@ ; CHECK-NEXT: // implicit-def: $q13 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: mov x13, xzr +; CHECK-NEXT: ldr q14, [x13] ; CHECK-NEXT: ldr q15, [x8] -; CHECK-NEXT: ldr q14, [x12] -; CHECK-NEXT: ldr q0, [x10], #64 -; CHECK-NEXT: ldr x18, [x12] -; CHECK-NEXT: fmov x15, d15 -; CHECK-NEXT: mov x14, v15.d[1] -; CHECK-NEXT: fmov x13, d14 -; CHECK-NEXT: mul x1, x15, x18 -; CHECK-NEXT: mov x16, v0.d[1] -; CHECK-NEXT: fmov x17, d0 -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: mul x1, x14, x18 +; CHECK-NEXT: subs x11, x11, #1 // =1 ; CHECK-NEXT: mov x12, v14.d[1] -; CHECK-NEXT: ldr x0, [x8] -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: mul x1, x13, x18 -; CHECK-NEXT: add v12.2d, v12.2d, v0.2d -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: mul x1, x12, x18 -; CHECK-NEXT: mov v0.d[1], x1 -; CHECK-NEXT: mul x1, x17, x18 -; CHECK-NEXT: add v13.2d, v13.2d, v0.2d -; CHECK-NEXT: add v11.2d, v11.2d, v0.2d -; CHECK-NEXT: fmov d0, x1 -; CHECK-NEXT: mul x18, x16, x18 -; CHECK-NEXT: ldr q14, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x18 -; CHECK-NEXT: mul x18, x15, x0 -; CHECK-NEXT: add x1, x11, x8 -; CHECK-NEXT: add v10.2d, v10.2d, v0.2d -; CHECK-NEXT: fmov d0, x18 -; CHECK-NEXT: mul x18, x14, x0 -; CHECK-NEXT: ldr x1, [x1, #128] -; CHECK-NEXT: mov v0.d[1], x18 -; CHECK-NEXT: mul x18, x13, x0 -; CHECK-NEXT: add v8.2d, v8.2d, v0.2d -; CHECK-NEXT: add v25.2d, v25.2d, v0.2d -; CHECK-NEXT: add v22.2d, v22.2d, v0.2d -; CHECK-NEXT: add v18.2d, v18.2d, v0.2d -; CHECK-NEXT: add v6.2d, v6.2d, v0.2d -; CHECK-NEXT: add v14.2d, v14.2d, v0.2d -; CHECK-NEXT: fmov d0, x18 -; CHECK-NEXT: mul x18, x12, x0 -; CHECK-NEXT: mov v0.d[1], x18 -; CHECK-NEXT: mul x18, x17, x0 -; CHECK-NEXT: mul x0, x16, x0 -; CHECK-NEXT: add v9.2d, v9.2d, v0.2d -; CHECK-NEXT: add v31.2d, v31.2d, v0.2d -; CHECK-NEXT: add v26.2d, v26.2d, v0.2d -; CHECK-NEXT: add v23.2d, v23.2d, v0.2d -; CHECK-NEXT: add v21.2d, v21.2d, v0.2d -; CHECK-NEXT: add v19.2d, v19.2d, v0.2d -; CHECK-NEXT: add v17.2d, v17.2d, v0.2d -; CHECK-NEXT: add v7.2d, v7.2d, v0.2d -; CHECK-NEXT: add v5.2d, v5.2d, v0.2d -; CHECK-NEXT: add v3.2d, v3.2d, v0.2d -; CHECK-NEXT: add v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmov d0, x18 -; CHECK-NEXT: mul x15, x15, x1 -; CHECK-NEXT: mov v0.d[1], x0 -; CHECK-NEXT: mul x14, x14, x1 -; CHECK-NEXT: add v30.2d, v30.2d, v0.2d -; CHECK-NEXT: add v24.2d, v24.2d, v0.2d -; CHECK-NEXT: add v20.2d, v20.2d, v0.2d -; CHECK-NEXT: add v16.2d, v16.2d, v0.2d -; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; 
CHECK-NEXT: add v1.2d, v1.2d, v0.2d -; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: mul x13, x13, x1 -; CHECK-NEXT: mov v0.d[1], x14 -; CHECK-NEXT: mul x12, x12, x1 -; CHECK-NEXT: add v29.2d, v29.2d, v0.2d -; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: mul x17, x17, x1 -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: mul x16, x16, x1 -; CHECK-NEXT: add v28.2d, v28.2d, v0.2d -; CHECK-NEXT: fmov d0, x17 -; CHECK-NEXT: mov v0.d[1], x16 -; CHECK-NEXT: add x8, x8, #8 // =8 -; CHECK-NEXT: add v27.2d, v27.2d, v0.2d -; CHECK-NEXT: cmp x8, #64 // =64 -; CHECK-NEXT: add x9, x9, #1 // =1 -; CHECK-NEXT: str q14, [sp] // 16-byte Folded Spill +; CHECK-NEXT: fmov x14, d14 +; CHECK-NEXT: ldr q14, [x10], #64 +; CHECK-NEXT: ldr x13, [x13] +; CHECK-NEXT: fmov x16, d15 +; CHECK-NEXT: mov x15, v15.d[1] +; CHECK-NEXT: mov x1, v14.d[1] +; CHECK-NEXT: mul x0, x16, x13 +; CHECK-NEXT: fmov x2, d14 +; CHECK-NEXT: fmov d14, x0 +; CHECK-NEXT: mul x0, x15, x13 +; CHECK-NEXT: ldr x17, [x8], #1 +; CHECK-NEXT: ldr x18, [x9], #8 +; CHECK-NEXT: mov v14.d[1], x0 +; CHECK-NEXT: mul x0, x14, x13 +; CHECK-NEXT: add v12.2d, v12.2d, v14.2d +; CHECK-NEXT: fmov d14, x0 +; CHECK-NEXT: mul x0, x12, x13 +; CHECK-NEXT: mov v14.d[1], x0 +; CHECK-NEXT: mul x0, x2, x13 +; CHECK-NEXT: add v13.2d, v13.2d, v14.2d +; CHECK-NEXT: add v11.2d, v11.2d, v14.2d +; CHECK-NEXT: fmov d14, x0 +; CHECK-NEXT: mul x13, x1, x13 +; CHECK-NEXT: mov v14.d[1], x13 +; CHECK-NEXT: mul x13, x16, x18 +; CHECK-NEXT: add v10.2d, v10.2d, v14.2d +; CHECK-NEXT: fmov d14, x13 +; CHECK-NEXT: mul x13, x15, x18 +; CHECK-NEXT: mov v14.d[1], x13 +; CHECK-NEXT: mul x13, x14, x17 +; CHECK-NEXT: mul x14, x14, x18 +; CHECK-NEXT: add v29.2d, v29.2d, v14.2d +; CHECK-NEXT: fmov d14, x14 +; CHECK-NEXT: mul x14, x12, x18 +; CHECK-NEXT: mov v14.d[1], x14 +; CHECK-NEXT: mul x0, x2, x18 +; CHECK-NEXT: mul x18, x1, x18 +; CHECK-NEXT: add v28.2d, v28.2d, v14.2d +; CHECK-NEXT: fmov d14, x0 +; CHECK-NEXT: mul x16, x16, x17 +; CHECK-NEXT: mov v14.d[1], x18 +; CHECK-NEXT: mul x15, x15, x17 +; CHECK-NEXT: add v27.2d, v27.2d, v14.2d +; CHECK-NEXT: fmov d14, x16 +; CHECK-NEXT: mov v14.d[1], x15 +; CHECK-NEXT: mul x12, x12, x17 +; CHECK-NEXT: add v8.2d, v8.2d, v14.2d +; CHECK-NEXT: add v25.2d, v25.2d, v14.2d +; CHECK-NEXT: add v22.2d, v22.2d, v14.2d +; CHECK-NEXT: add v18.2d, v18.2d, v14.2d +; CHECK-NEXT: add v6.2d, v6.2d, v14.2d +; CHECK-NEXT: add v0.2d, v0.2d, v14.2d +; CHECK-NEXT: fmov d14, x13 +; CHECK-NEXT: mul x14, x2, x17 +; CHECK-NEXT: mov v14.d[1], x12 +; CHECK-NEXT: mul x13, x1, x17 +; CHECK-NEXT: add v9.2d, v9.2d, v14.2d +; CHECK-NEXT: add v31.2d, v31.2d, v14.2d +; CHECK-NEXT: add v26.2d, v26.2d, v14.2d +; CHECK-NEXT: add v23.2d, v23.2d, v14.2d +; CHECK-NEXT: add v21.2d, v21.2d, v14.2d +; CHECK-NEXT: add v19.2d, v19.2d, v14.2d +; CHECK-NEXT: add v17.2d, v17.2d, v14.2d +; CHECK-NEXT: add v7.2d, v7.2d, v14.2d +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: add v3.2d, v3.2d, v14.2d +; CHECK-NEXT: add v2.2d, v2.2d, v14.2d +; CHECK-NEXT: fmov d14, x14 +; CHECK-NEXT: mov v14.d[1], x13 +; CHECK-NEXT: add v30.2d, v30.2d, v14.2d +; CHECK-NEXT: add v24.2d, v24.2d, v14.2d +; CHECK-NEXT: add v20.2d, v20.2d, v14.2d +; CHECK-NEXT: add v16.2d, v16.2d, v14.2d +; CHECK-NEXT: add v4.2d, v4.2d, v14.2d +; CHECK-NEXT: add v1.2d, v1.2d, v14.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: stp q13, q12, [x8] ; CHECK-NEXT: stp q11, q10, [x8, #32] ; CHECK-NEXT: stp 
q9, q8, [x8, #64] -; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: stp q31, q30, [x8, #96] ; CHECK-NEXT: stp q29, q28, [x8, #144] ; CHECK-NEXT: stp q27, q26, [x8, #176] @@ -171,7 +162,7 @@ ; CHECK-NEXT: stp q4, q3, [x8, #432] ; CHECK-NEXT: stp q0, q2, [x8, #464] ; CHECK-NEXT: str q1, [x8, #496] -; CHECK-NEXT: add sp, sp, #80 // =80 +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: br label %for.cond1.preheader Index: llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll =================================================================== --- llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll +++ llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll @@ -13,7 +13,7 @@ ; x[i] = a[i] + 1; ; } ; -; RUN: llc -mtriple aarch64-linux %s -o - | FileCheck %s +; RUN: llc -mtriple aarch64-linux %s -lsr-preferred-addressing-mode=none -o - | FileCheck %s --check-prefix=CHECK define dso_local void @f(i32 %n, i32* nocapture %x) { entry: Index: llvm/test/CodeGen/AArch64/vldn_shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-none-eabif -lsr-preferred-addressing-mode=none | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=aarch64-none-eabif | FileCheck %s --check-prefix=POSTINDEXED +; RUN: llc < %s -mtriple=aarch64-none-eabif -lsr-preferred-addressing-mode=postindexed | FileCheck %s --check-prefix=POSTINDEXED define void @vld2(float* nocapture readonly %pSrc, float* noalias nocapture %pDst, i32 %numSamples) { ; CHECK-LABEL: vld2: @@ -16,6 +18,20 @@ ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %while.end ; CHECK-NEXT: ret +; +; POSTINDEXED-LABEL: vld2: +; POSTINDEXED: // %bb.0: // %entry +; POSTINDEXED-NEXT: mov w8, #1024 +; POSTINDEXED-NEXT: .LBB0_1: // %vector.body +; POSTINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEXED-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32 +; POSTINDEXED-NEXT: subs x8, x8, #4 // =4 +; POSTINDEXED-NEXT: fmul v2.4s, v0.4s, v0.4s +; POSTINDEXED-NEXT: fmla v2.4s, v1.4s, v1.4s +; POSTINDEXED-NEXT: str q2, [x1], #16 +; POSTINDEXED-NEXT: b.ne .LBB0_1 +; POSTINDEXED-NEXT: // %bb.2: // %while.end +; POSTINDEXED-NEXT: ret entry: br label %vector.body @@ -57,6 +73,21 @@ ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %while.end ; CHECK-NEXT: ret +; +; POSTINDEXED-LABEL: vld3: +; POSTINDEXED: // %bb.0: // %entry +; POSTINDEXED-NEXT: mov w8, #1024 +; POSTINDEXED-NEXT: .LBB1_1: // %vector.body +; POSTINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEXED-NEXT: ld3 { v0.4s, v1.4s, v2.4s }, [x0], #48 +; POSTINDEXED-NEXT: subs x8, x8, #4 // =4 +; POSTINDEXED-NEXT: fmul v3.4s, v0.4s, v0.4s +; POSTINDEXED-NEXT: fmla v3.4s, v1.4s, v1.4s +; POSTINDEXED-NEXT: fmla v3.4s, v2.4s, v2.4s +; POSTINDEXED-NEXT: str q3, [x1], #16 +; POSTINDEXED-NEXT: b.ne .LBB1_1 +; POSTINDEXED-NEXT: // %bb.2: // %while.end +; POSTINDEXED-NEXT: ret 
entry: br label %vector.body @@ -103,6 +134,22 @@ ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %while.end ; CHECK-NEXT: ret +; +; POSTINDEXED-LABEL: vld4: +; POSTINDEXED: // %bb.0: // %entry +; POSTINDEXED-NEXT: mov w8, #1024 +; POSTINDEXED-NEXT: .LBB2_1: // %vector.body +; POSTINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEXED-NEXT: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64 +; POSTINDEXED-NEXT: subs x8, x8, #4 // =4 +; POSTINDEXED-NEXT: fmul v4.4s, v0.4s, v0.4s +; POSTINDEXED-NEXT: fmla v4.4s, v1.4s, v1.4s +; POSTINDEXED-NEXT: fmul v5.4s, v2.4s, v2.4s +; POSTINDEXED-NEXT: fmla v5.4s, v3.4s, v3.4s +; POSTINDEXED-NEXT: st2 { v4.4s, v5.4s }, [x1], #32 +; POSTINDEXED-NEXT: b.ne .LBB2_1 +; POSTINDEXED-NEXT: // %bb.2: // %while.end +; POSTINDEXED-NEXT: ret entry: br label %vector.body @@ -153,6 +200,21 @@ ; CHECK-NEXT: b.ne .LBB3_1 ; CHECK-NEXT: // %bb.2: // %while.end ; CHECK-NEXT: ret +; +; POSTINDEXED-LABEL: twosrc: +; POSTINDEXED: // %bb.0: // %entry +; POSTINDEXED-NEXT: mov w8, #1024 +; POSTINDEXED-NEXT: .LBB3_1: // %vector.body +; POSTINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEXED-NEXT: ld2 { v0.4s, v1.4s }, [x0], #32 +; POSTINDEXED-NEXT: ld2 { v2.4s, v3.4s }, [x1], #32 +; POSTINDEXED-NEXT: subs x8, x8, #4 // =4 +; POSTINDEXED-NEXT: fmul v4.4s, v2.4s, v0.4s +; POSTINDEXED-NEXT: fmla v4.4s, v1.4s, v3.4s +; POSTINDEXED-NEXT: str q4, [x2], #16 +; POSTINDEXED-NEXT: b.ne .LBB3_1 +; POSTINDEXED-NEXT: // %bb.2: // %while.end +; POSTINDEXED-NEXT: ret entry: br label %vector.body Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll =================================================================== --- llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll +++ llvm/test/Transforms/LoopStrengthReduce/AArch64/small-constant.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -lsr-preferred-addressing-mode=postindexed | FileCheck %s --check-prefix=POSTINDEXED +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -lsr-preferred-addressing-mode=preindexed | FileCheck %s --check-prefix=PREINDEXED +; RUN: llc < %s -mtriple=aarch64-unknown-unknown -lsr-preferred-addressing-mode=none | FileCheck %s --check-prefix=AMKNONE ; Test LSR for giving small constants, which get re-associated as unfolded ; offset, a chance to get combined with loop-invariant registers (same as @@ -20,15 +23,17 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cbz x1, .LBB0_4 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: add x8, x0, #28 // =28 +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: add x8, x8, #28 // =28 ; CHECK-NEXT: .LBB0_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr s1, [x8, x1, lsl #2] +; CHECK-NEXT: ldr s1, [x8] ; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: b.gt .LBB0_5 ; CHECK-NEXT: // %bb.3: // %for.cond ; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: add x1, x1, #1 // =1 +; CHECK-NEXT: add x8, x8, #4 // =4 ; CHECK-NEXT: cbnz x1, .LBB0_2 ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: fmov s0, #-7.00000000 @@ -36,6 +41,74 @@ ; CHECK-NEXT: .LBB0_5: // %cleanup2 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret +; +; POSTINDEXED-LABEL: test1: +; POSTINDEXED: // %bb.0: // %entry +; POSTINDEXED-NEXT: cbz x1, .LBB0_4 +; POSTINDEXED-NEXT: // %bb.1: // 
%for.body.preheader +; POSTINDEXED-NEXT: add x8, x0, x1, lsl #2 +; POSTINDEXED-NEXT: add x8, x8, #28 // =28 +; POSTINDEXED-NEXT: .LBB0_2: // %for.body +; POSTINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEXED-NEXT: ldr s1, [x8] +; POSTINDEXED-NEXT: fcmp s1, s0 +; POSTINDEXED-NEXT: b.gt .LBB0_5 +; POSTINDEXED-NEXT: // %bb.3: // %for.cond +; POSTINDEXED-NEXT: // in Loop: Header=BB0_2 Depth=1 +; POSTINDEXED-NEXT: add x1, x1, #1 // =1 +; POSTINDEXED-NEXT: add x8, x8, #4 // =4 +; POSTINDEXED-NEXT: cbnz x1, .LBB0_2 +; POSTINDEXED-NEXT: .LBB0_4: +; POSTINDEXED-NEXT: fmov s0, #-7.00000000 +; POSTINDEXED-NEXT: ret +; POSTINDEXED-NEXT: .LBB0_5: // %cleanup2 +; POSTINDEXED-NEXT: mov v0.16b, v1.16b +; POSTINDEXED-NEXT: ret +; +; PREINDEXED-LABEL: test1: +; PREINDEXED: // %bb.0: // %entry +; PREINDEXED-NEXT: cbz x1, .LBB0_4 +; PREINDEXED-NEXT: // %bb.1: // %for.body.preheader +; PREINDEXED-NEXT: add x9, x0, x1, lsl #2 +; PREINDEXED-NEXT: neg x8, x1 +; PREINDEXED-NEXT: add x9, x9, #24 // =24 +; PREINDEXED-NEXT: .LBB0_2: // %for.body +; PREINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; PREINDEXED-NEXT: ldr s1, [x9, #4] +; PREINDEXED-NEXT: fcmp s1, s0 +; PREINDEXED-NEXT: b.gt .LBB0_5 +; PREINDEXED-NEXT: // %bb.3: // %for.cond +; PREINDEXED-NEXT: // in Loop: Header=BB0_2 Depth=1 +; PREINDEXED-NEXT: subs x8, x8, #1 // =1 +; PREINDEXED-NEXT: add x9, x9, #4 // =4 +; PREINDEXED-NEXT: b.ne .LBB0_2 +; PREINDEXED-NEXT: .LBB0_4: +; PREINDEXED-NEXT: fmov s0, #-7.00000000 +; PREINDEXED-NEXT: ret +; PREINDEXED-NEXT: .LBB0_5: // %cleanup2 +; PREINDEXED-NEXT: mov v0.16b, v1.16b +; PREINDEXED-NEXT: ret +; +; AMKNONE-LABEL: test1: +; AMKNONE: // %bb.0: // %entry +; AMKNONE-NEXT: cbz x1, .LBB0_4 +; AMKNONE-NEXT: // %bb.1: // %for.body.preheader +; AMKNONE-NEXT: add x8, x0, #28 // =28 +; AMKNONE-NEXT: .LBB0_2: // %for.body +; AMKNONE-NEXT: // =>This Inner Loop Header: Depth=1 +; AMKNONE-NEXT: ldr s1, [x8, x1, lsl #2] +; AMKNONE-NEXT: fcmp s1, s0 +; AMKNONE-NEXT: b.gt .LBB0_5 +; AMKNONE-NEXT: // %bb.3: // %for.cond +; AMKNONE-NEXT: // in Loop: Header=BB0_2 Depth=1 +; AMKNONE-NEXT: add x1, x1, #1 // =1 +; AMKNONE-NEXT: cbnz x1, .LBB0_2 +; AMKNONE-NEXT: .LBB0_4: +; AMKNONE-NEXT: fmov s0, #-7.00000000 +; AMKNONE-NEXT: ret +; AMKNONE-NEXT: .LBB0_5: // %cleanup2 +; AMKNONE-NEXT: mov v0.16b, v1.16b +; AMKNONE-NEXT: ret entry: %cmp11 = icmp eq i64 %start, 0 br i1 %cmp11, label %cleanup2, label %for.body @@ -65,10 +138,11 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cbz x1, .LBB1_4 ; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: add x8, x0, #28 // =28 +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: add x8, x8, #28 // =28 ; CHECK-NEXT: .LBB1_2: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr s1, [x8, x1, lsl #2] +; CHECK-NEXT: ldr s1, [x8] ; CHECK-NEXT: scvtf s2, x1 ; CHECK-NEXT: fadd s2, s2, s0 ; CHECK-NEXT: fcmp s1, s2 @@ -76,6 +150,7 @@ ; CHECK-NEXT: // %bb.3: // %for.cond ; CHECK-NEXT: // in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: add x1, x1, #1 // =1 +; CHECK-NEXT: add x8, x8, #4 // =4 ; CHECK-NEXT: cbnz x1, .LBB1_2 ; CHECK-NEXT: .LBB1_4: ; CHECK-NEXT: fmov s0, #-7.00000000 @@ -83,6 +158,77 @@ ; CHECK-NEXT: .LBB1_5: // %cleanup4 ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: ret +; +; POSTINDEXED-LABEL: test2: +; POSTINDEXED: // %bb.0: // %entry +; POSTINDEXED-NEXT: cbz x1, .LBB1_4 +; POSTINDEXED-NEXT: // %bb.1: // %for.body.preheader +; POSTINDEXED-NEXT: add x8, x0, x1, lsl #2 +; POSTINDEXED-NEXT: add x8, x8, #28 // =28 +; POSTINDEXED-NEXT: .LBB1_2: 
// %for.body +; POSTINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; POSTINDEXED-NEXT: ldr s1, [x8] +; POSTINDEXED-NEXT: scvtf s2, x1 +; POSTINDEXED-NEXT: fadd s2, s2, s0 +; POSTINDEXED-NEXT: fcmp s1, s2 +; POSTINDEXED-NEXT: b.gt .LBB1_5 +; POSTINDEXED-NEXT: // %bb.3: // %for.cond +; POSTINDEXED-NEXT: // in Loop: Header=BB1_2 Depth=1 +; POSTINDEXED-NEXT: add x1, x1, #1 // =1 +; POSTINDEXED-NEXT: add x8, x8, #4 // =4 +; POSTINDEXED-NEXT: cbnz x1, .LBB1_2 +; POSTINDEXED-NEXT: .LBB1_4: +; POSTINDEXED-NEXT: fmov s0, #-7.00000000 +; POSTINDEXED-NEXT: ret +; POSTINDEXED-NEXT: .LBB1_5: // %cleanup4 +; POSTINDEXED-NEXT: mov v0.16b, v1.16b +; POSTINDEXED-NEXT: ret +; +; PREINDEXED-LABEL: test2: +; PREINDEXED: // %bb.0: // %entry +; PREINDEXED-NEXT: cbz x1, .LBB1_4 +; PREINDEXED-NEXT: // %bb.1: // %for.body.preheader +; PREINDEXED-NEXT: add x8, x0, #28 // =28 +; PREINDEXED-NEXT: .LBB1_2: // %for.body +; PREINDEXED-NEXT: // =>This Inner Loop Header: Depth=1 +; PREINDEXED-NEXT: ldr s1, [x8, x1, lsl #2] +; PREINDEXED-NEXT: scvtf s2, x1 +; PREINDEXED-NEXT: fadd s2, s2, s0 +; PREINDEXED-NEXT: fcmp s1, s2 +; PREINDEXED-NEXT: b.gt .LBB1_5 +; PREINDEXED-NEXT: // %bb.3: // %for.cond +; PREINDEXED-NEXT: // in Loop: Header=BB1_2 Depth=1 +; PREINDEXED-NEXT: add x1, x1, #1 // =1 +; PREINDEXED-NEXT: cbnz x1, .LBB1_2 +; PREINDEXED-NEXT: .LBB1_4: +; PREINDEXED-NEXT: fmov s0, #-7.00000000 +; PREINDEXED-NEXT: ret +; PREINDEXED-NEXT: .LBB1_5: // %cleanup4 +; PREINDEXED-NEXT: mov v0.16b, v1.16b +; PREINDEXED-NEXT: ret +; +; AMKNONE-LABEL: test2: +; AMKNONE: // %bb.0: // %entry +; AMKNONE-NEXT: cbz x1, .LBB1_4 +; AMKNONE-NEXT: // %bb.1: // %for.body.preheader +; AMKNONE-NEXT: add x8, x0, #28 // =28 +; AMKNONE-NEXT: .LBB1_2: // %for.body +; AMKNONE-NEXT: // =>This Inner Loop Header: Depth=1 +; AMKNONE-NEXT: ldr s1, [x8, x1, lsl #2] +; AMKNONE-NEXT: scvtf s2, x1 +; AMKNONE-NEXT: fadd s2, s2, s0 +; AMKNONE-NEXT: fcmp s1, s2 +; AMKNONE-NEXT: b.gt .LBB1_5 +; AMKNONE-NEXT: // %bb.3: // %for.cond +; AMKNONE-NEXT: // in Loop: Header=BB1_2 Depth=1 +; AMKNONE-NEXT: add x1, x1, #1 // =1 +; AMKNONE-NEXT: cbnz x1, .LBB1_2 +; AMKNONE-NEXT: .LBB1_4: +; AMKNONE-NEXT: fmov s0, #-7.00000000 +; AMKNONE-NEXT: ret +; AMKNONE-NEXT: .LBB1_5: // %cleanup4 +; AMKNONE-NEXT: mov v0.16b, v1.16b +; AMKNONE-NEXT: ret entry: %cmp14 = icmp eq i64 %start, 0 br i1 %cmp14, label %cleanup4, label %for.body Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll =================================================================== --- llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll +++ llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-factor-out-constant.ll @@ -1,15 +1,77 @@ -; RUN: opt -S -loop-reduce < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -loop-reduce < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -S -loop-reduce -lsr-preferred-addressing-mode=none < %s | FileCheck %s --check-prefix=AMKNONE target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" ; This test check SCEVExpander FactorOutConstant() is not crashing with blind cast 'Factor' to SCEVConstant. -; CHECK-LABEL: test ; FIXME: Handle VectorType in SCEVExpander::expandAddToGEP. ; The generated IR is not ideal with base 'scalar_vector' cast to i8*, and do ugly getelementptr over casted base. 
-; CHECK: uglygep
+
 define void @test(i32* %a, i32 %v, i64 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SCALAR_VECTOR:%.*]] = alloca <vscale x 4 x i32>, align 16
+; CHECK-NEXT:    [[NUM_ELM:%.*]] = call i64 @llvm.aarch64.sve.cntw(i32 31)
+; CHECK-NEXT:    [[SCALAR_COUNT:%.*]] = and i64 [[NUM_ELM]], -4
+; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       loop_header:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_LOOP:%.*]] ]
+; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 0
+; CHECK-NEXT:    [[GEP_VEC_0:%.*]] = getelementptr inbounds <vscale x 4 x i32>, <vscale x 4 x i32>* [[SCALAR_VECTOR]], i64 0, i64 0
+; CHECK-NEXT:    br label [[SCALAR_LOOP:%.*]]
+; CHECK:       scalar_loop:
+; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[SCALAR_LOOP]] ], [ [[SCALAR_COUNT]], [[LOOP_HEADER]] ]
+; CHECK-NEXT:    [[GEP_VEC:%.*]] = phi i32* [ [[GEP_VEC_0]], [[LOOP_HEADER]] ], [ [[GEP_VEC_INC:%.*]], [[SCALAR_LOOP]] ]
+; CHECK-NEXT:    store i32 [[V:%.*]], i32* [[GEP_VEC]], align 4
+; CHECK-NEXT:    [[GEP_VEC_INC]] = getelementptr i32, i32* [[GEP_VEC]], i64 1
+; CHECK-NEXT:    [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT:    [[SCALAR_EXIT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT:    br i1 [[SCALAR_EXIT]], label [[FOR_LOOP]], label [[SCALAR_LOOP]]
+; CHECK:       for_loop:
+; CHECK-NEXT:    [[VECTOR:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[SCALAR_VECTOR]], align 16
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr i32, i32* [[GEP_A_0]], i64 [[INDVAR]]
+; CHECK-NEXT:    [[VECTOR_PTR:%.*]] = bitcast i32* [[GEP_A]] to <vscale x 4 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[VECTOR]], <vscale x 4 x i32>* [[VECTOR_PTR]], i32 4, <vscale x 4 x i1> undef)
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALAR_COUNT]]
+; CHECK-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+;
+; AMKNONE-LABEL: @test(
+; AMKNONE-NEXT:  entry:
+; AMKNONE-NEXT:    [[SCALAR_VECTOR:%.*]] = alloca <vscale x 4 x i32>, align 16
+; AMKNONE-NEXT:    [[SCALAR_VECTOR1:%.*]] = bitcast <vscale x 4 x i32>* [[SCALAR_VECTOR]] to i8*
+; AMKNONE-NEXT:    [[NUM_ELM:%.*]] = call i64 @llvm.aarch64.sve.cntw(i32 31)
+; AMKNONE-NEXT:    [[SCALAR_COUNT:%.*]] = and i64 [[NUM_ELM]], -4
+; AMKNONE-NEXT:    br label [[LOOP_HEADER:%.*]]
+; AMKNONE:       exit:
+; AMKNONE-NEXT:    ret void
+; AMKNONE:       loop_header:
+; AMKNONE-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_LOOP:%.*]] ]
+; AMKNONE-NEXT:    [[GEP_A_0:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 0
+; AMKNONE-NEXT:    br label [[SCALAR_LOOP:%.*]]
+; AMKNONE:       scalar_loop:
+; AMKNONE-NEXT:    [[SCALAR_IV:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[SCALAR_IV_NEXT:%.*]], [[SCALAR_LOOP]] ]
+; AMKNONE-NEXT:    [[TMP0:%.*]] = shl i64 [[SCALAR_IV]], 2
+; AMKNONE-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[SCALAR_VECTOR1]], i64 [[TMP0]]
+; AMKNONE-NEXT:    [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to i32*
+; AMKNONE-NEXT:    store i32 [[V:%.*]], i32* [[UGLYGEP2]], align 4
+; AMKNONE-NEXT:    [[SCALAR_IV_NEXT]] = add i64 [[SCALAR_IV]], 1
+; AMKNONE-NEXT:    [[SCALAR_EXIT:%.*]] = icmp eq i64 [[SCALAR_COUNT]], [[SCALAR_IV_NEXT]]
+; AMKNONE-NEXT:    br i1 [[SCALAR_EXIT]], label [[FOR_LOOP]], label [[SCALAR_LOOP]]
+; AMKNONE:       for_loop:
+; AMKNONE-NEXT:    [[VECTOR:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[SCALAR_VECTOR]], align 16
+; AMKNONE-NEXT:    [[GEP_A:%.*]] = getelementptr i32, i32* [[GEP_A_0]], i64 [[INDVAR]]
+; AMKNONE-NEXT:    [[VECTOR_PTR:%.*]] = bitcast i32* [[GEP_A]] to <vscale x 4 x i32>*
+; AMKNONE-NEXT:    call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[VECTOR]], <vscale x 4 x i32>* [[VECTOR_PTR]], i32 4, <vscale x 4 x i1> undef)
+; AMKNONE-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALAR_COUNT]]
+; AMKNONE-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[N:%.*]]
+; AMKNONE-NEXT:    br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
+;
 entry:
   %scalar_vector = alloca <vscale x 4 x i32>, align 16
   %num_elm = call i64 @llvm.aarch64.sve.cntw(i32 31)