Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -114,6 +114,8 @@
   unsigned getMaxInterleaveFactor(unsigned VF);
 
+  bool shouldFavorPostInc() const;
+
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        TTI::CastContextHint CCH,
                        TTI::TargetCostKind CostKind,
                        const Instruction *I = nullptr);
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -82,6 +82,10 @@
   return std::max(1, Cost);
 }
 
+bool AArch64TTIImpl::shouldFavorPostInc() const {
+  return true;
+}
+
 int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                       const APInt &Imm, Type *Ty,
                                       TTI::TargetCostKind CostKind,
Index: llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
+++ llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
@@ -10,17 +10,16 @@
 define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
 ; CHECK-LABEL: mulDouble:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    add x9, x0, #8 ; =8
-; CHECK-NEXT:    add x10, x2, #16 ; =16
+; CHECK-NEXT:    add x8, x0, #8 ; =8
+; CHECK-NEXT:    add x9, x2, #16 ; =16
+; CHECK-NEXT:    mov w10, #19
 ; CHECK-NEXT:  LBB0_1: ; %for.body
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ldr d0, [x1, x8]
-; CHECK-NEXT:    ldr d1, [x10, x8]
+; CHECK-NEXT:    ldr d0, [x1], #8
+; CHECK-NEXT:    ldr d1, [x9], #8
+; CHECK-NEXT:    subs w10, w10, #1 ; =1
 ; CHECK-NEXT:    fmul d0, d0, d1
-; CHECK-NEXT:    str d0, [x9, x8]
-; CHECK-NEXT:    add x8, x8, #8 ; =8
-; CHECK-NEXT:    cmp w8, #152 ; =152
+; CHECK-NEXT:    str d0, [x8], #8
 ; CHECK-NEXT:    b.ne LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.end
 ; CHECK-NEXT:    ret
Index: llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
+++ llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
@@ -8,12 +8,11 @@
 define dso_local void @run_test() local_unnamed_addr #0 {
 ; CHECK-LABEL: run_test:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sub sp, sp, #80 // =80
-; CHECK-NEXT:    stp d15, d14, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d13, d12, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d11, d10, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT:    stp d9, d8, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
 ; CHECK-NEXT:    .cfi_offset b8, -8
 ; CHECK-NEXT:    .cfi_offset b9, -16
 ; CHECK-NEXT:    .cfi_offset b10, -24
@@ -22,14 +21,13 @@
 ; CHECK-NEXT:    .cfi_offset b13, -48
 ; CHECK-NEXT:    .cfi_offset b14, -56
 ; CHECK-NEXT:    .cfi_offset b15, -64
+; CHECK-NEXT:    adrp x9, A+128
 ; CHECK-NEXT:    adrp x10, B+48
-; CHECK-NEXT:    adrp x11, A
 ; CHECK-NEXT:    mov x8, xzr
-; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    add x9, x9, :lo12:A+128
 ; CHECK-NEXT:    add x10, x10, :lo12:B+48
-; CHECK-NEXT:    add x11, x11, :lo12:A
-; CHECK-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    mov w11, #8
 ; CHECK-NEXT:    // implicit-def: $q1
 ; CHECK-NEXT:    // implicit-def: $q2
 ; CHECK-NEXT:    // implicit-def: $q3
@@ -61,103 +59,96 @@
 ; CHECK-NEXT:    // implicit-def: $q13
 ; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    mov x12, xzr
+; CHECK-NEXT:    mov x13, xzr
+; CHECK-NEXT:    ldr q14, [x13]
 ; CHECK-NEXT:    ldr q15, [x8]
-; CHECK-NEXT:    ldr q14, [x12]
-; CHECK-NEXT:    ldr q0, [x10], #64
-; CHECK-NEXT:    ldr x18, [x12]
-; CHECK-NEXT:    fmov x15, d15
-; CHECK-NEXT:    mov x14, v15.d[1]
-; CHECK-NEXT:    fmov x13, d14
-; CHECK-NEXT:    mul x1, x15, x18
-; CHECK-NEXT:    mov x16, v0.d[1]
-; CHECK-NEXT:    fmov x17, d0
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    mul x1, x14, x18
+; CHECK-NEXT:    subs x11, x11, #1 // =1
 ; CHECK-NEXT:    mov x12, v14.d[1]
-; CHECK-NEXT:    ldr x0, [x8]
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    mul x1, x13, x18
-; CHECK-NEXT:    add v12.2d, v12.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    mul x1, x12, x18
-; CHECK-NEXT:    mov v0.d[1], x1
-; CHECK-NEXT:    mul x1, x17, x18
-; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
-; CHECK-NEXT:    add v11.2d, v11.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x1
-; CHECK-NEXT:    mul x18, x16, x18
-; CHECK-NEXT:    ldr q14, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    mov v0.d[1], x18
-; CHECK-NEXT:    mul x18, x15, x0
-; CHECK-NEXT:    add x1, x11, x8
-; CHECK-NEXT:    add v10.2d, v10.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x18
-; CHECK-NEXT:    mul x18, x14, x0
-; CHECK-NEXT:    ldr x1, [x1, #128]
-; CHECK-NEXT:    mov v0.d[1], x18
-; CHECK-NEXT:    mul x18, x13, x0
-; CHECK-NEXT:    add v8.2d, v8.2d, v0.2d
-; CHECK-NEXT:    add v25.2d, v25.2d, v0.2d
-; CHECK-NEXT:    add v22.2d, v22.2d, v0.2d
-; CHECK-NEXT:    add v18.2d, v18.2d, v0.2d
-; CHECK-NEXT:    add v6.2d, v6.2d, v0.2d
-; CHECK-NEXT:    add v14.2d, v14.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x18
-; CHECK-NEXT:    mul x18, x12, x0
-; CHECK-NEXT:    mov v0.d[1], x18
-; CHECK-NEXT:    mul x18, x17, x0
-; CHECK-NEXT:    mul x0, x16, x0
-; CHECK-NEXT:    add v9.2d, v9.2d, v0.2d
-; CHECK-NEXT:    add v31.2d, v31.2d, v0.2d
-; CHECK-NEXT:    add v26.2d, v26.2d, v0.2d
-; CHECK-NEXT:    add v23.2d, v23.2d, v0.2d
-; CHECK-NEXT:    add v21.2d, v21.2d, v0.2d
-; CHECK-NEXT:    add v19.2d, v19.2d, v0.2d
-; CHECK-NEXT:    add v17.2d, v17.2d, v0.2d
-; CHECK-NEXT:    add v7.2d, v7.2d, v0.2d
-; CHECK-NEXT:    add v5.2d, v5.2d, v0.2d
-; CHECK-NEXT:    add v3.2d, v3.2d, v0.2d
-; CHECK-NEXT:    add v2.2d, v2.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x18
-; CHECK-NEXT:    mul x15, x15, x1
-; CHECK-NEXT:    mov v0.d[1], x0
-; CHECK-NEXT:    mul x14, x14, x1
-; CHECK-NEXT:    add v30.2d, v30.2d, v0.2d
-; CHECK-NEXT:    add v24.2d, v24.2d, v0.2d
-; CHECK-NEXT:    add v20.2d, v20.2d, v0.2d
-; CHECK-NEXT:    add v16.2d, v16.2d, v0.2d
-; CHECK-NEXT:    add v4.2d, v4.2d, v0.2d
-; CHECK-NEXT:    add v1.2d, v1.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x15
-; CHECK-NEXT:    mul x13, x13, x1
-; CHECK-NEXT:    mov v0.d[1], x14
-; CHECK-NEXT:    mul x12, x12, x1
-; CHECK-NEXT:    add v29.2d, v29.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x13
-; CHECK-NEXT:    mul x17, x17, x1
-; CHECK-NEXT:    mov v0.d[1], x12
-; CHECK-NEXT:    mul x16, x16, x1
-; CHECK-NEXT:    add v28.2d, v28.2d, v0.2d
-; CHECK-NEXT:    fmov d0, x17
-; CHECK-NEXT:    mov v0.d[1], x16
-; CHECK-NEXT:    add x8, x8, #8 // =8
-; CHECK-NEXT:    add v27.2d, v27.2d, v0.2d
-; CHECK-NEXT:    cmp x8, #64 // =64
-; CHECK-NEXT:    add x9, x9, #1 // =1
-; CHECK-NEXT:    str q14, [sp] // 16-byte Folded Spill
+; CHECK-NEXT:    fmov x14, d14
+; CHECK-NEXT:    ldr q14, [x10], #64
+; CHECK-NEXT:    ldr x13, [x13]
+; CHECK-NEXT:    fmov x16, d15
+; CHECK-NEXT:    mov x15, v15.d[1]
+; CHECK-NEXT:    mov x1, v14.d[1]
+; CHECK-NEXT:    mul x0, x16, x13
+; CHECK-NEXT:    fmov x2, d14
+; CHECK-NEXT:    fmov d14, x0
+; CHECK-NEXT:    mul x0, x15, x13
+; CHECK-NEXT:    ldr x17, [x8], #1
+; CHECK-NEXT:    ldr x18, [x9], #8
+; CHECK-NEXT:    mov v14.d[1], x0
+; CHECK-NEXT:    mul x0, x14, x13
+; CHECK-NEXT:    add v12.2d, v12.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x0
+; CHECK-NEXT:    mul x0, x12, x13
+; CHECK-NEXT:    mov v14.d[1], x0
+; CHECK-NEXT:    mul x0, x2, x13
+; CHECK-NEXT:    add v13.2d, v13.2d, v14.2d
+; CHECK-NEXT:    add v11.2d, v11.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x0
+; CHECK-NEXT:    mul x13, x1, x13
+; CHECK-NEXT:    mov v14.d[1], x13
+; CHECK-NEXT:    mul x13, x16, x18
+; CHECK-NEXT:    add v10.2d, v10.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x13
+; CHECK-NEXT:    mul x13, x15, x18
+; CHECK-NEXT:    mov v14.d[1], x13
+; CHECK-NEXT:    mul x13, x14, x17
+; CHECK-NEXT:    mul x14, x14, x18
+; CHECK-NEXT:    add v29.2d, v29.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x14
+; CHECK-NEXT:    mul x14, x12, x18
+; CHECK-NEXT:    mov v14.d[1], x14
+; CHECK-NEXT:    mul x0, x2, x18
+; CHECK-NEXT:    mul x18, x1, x18
+; CHECK-NEXT:    add v28.2d, v28.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x0
+; CHECK-NEXT:    mul x16, x16, x17
+; CHECK-NEXT:    mov v14.d[1], x18
+; CHECK-NEXT:    mul x15, x15, x17
+; CHECK-NEXT:    add v27.2d, v27.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x16
+; CHECK-NEXT:    mov v14.d[1], x15
+; CHECK-NEXT:    mul x12, x12, x17
+; CHECK-NEXT:    add v8.2d, v8.2d, v14.2d
+; CHECK-NEXT:    add v25.2d, v25.2d, v14.2d
+; CHECK-NEXT:    add v22.2d, v22.2d, v14.2d
+; CHECK-NEXT:    add v18.2d, v18.2d, v14.2d
+; CHECK-NEXT:    add v6.2d, v6.2d, v14.2d
+; CHECK-NEXT:    add v0.2d, v0.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x13
+; CHECK-NEXT:    mul x14, x2, x17
+; CHECK-NEXT:    mov v14.d[1], x12
+; CHECK-NEXT:    mul x13, x1, x17
+; CHECK-NEXT:    add v9.2d, v9.2d, v14.2d
+; CHECK-NEXT:    add v31.2d, v31.2d, v14.2d
+; CHECK-NEXT:    add v26.2d, v26.2d, v14.2d
+; CHECK-NEXT:    add v23.2d, v23.2d, v14.2d
+; CHECK-NEXT:    add v21.2d, v21.2d, v14.2d
+; CHECK-NEXT:    add v19.2d, v19.2d, v14.2d
+; CHECK-NEXT:    add v17.2d, v17.2d, v14.2d
+; CHECK-NEXT:    add v7.2d, v7.2d, v14.2d
+; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
+; CHECK-NEXT:    add v3.2d, v3.2d, v14.2d
+; CHECK-NEXT:    add v2.2d, v2.2d, v14.2d
+; CHECK-NEXT:    fmov d14, x14
+; CHECK-NEXT:    mov v14.d[1], x13
+; CHECK-NEXT:    add v30.2d, v30.2d, v14.2d
+; CHECK-NEXT:    add v24.2d, v24.2d, v14.2d
+; CHECK-NEXT:    add v20.2d, v20.2d, v14.2d
+; CHECK-NEXT:    add v16.2d, v16.2d, v14.2d
+; CHECK-NEXT:    add v4.2d, v4.2d, v14.2d
+; CHECK-NEXT:    add v1.2d, v1.2d, v14.2d
 ; CHECK-NEXT:    b.ne .LBB0_1
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
 ; CHECK-NEXT:    adrp x8, C
 ; CHECK-NEXT:    add x8, x8, :lo12:C
-; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q13, q12, [x8]
 ; CHECK-NEXT:    stp q11, q10, [x8, #32]
 ; CHECK-NEXT:    stp q9, q8, [x8, #64]
-; CHECK-NEXT:    ldp d9, d8, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d11, d10, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d13, d12, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    stp q31, q30, [x8, #96]
 ; CHECK-NEXT:    stp q29, q28, [x8, #144]
 ; CHECK-NEXT:    stp q27, q26, [x8, #176]
@@ -171,7 +162,7 @@
 ; CHECK-NEXT:    stp q4, q3, [x8, #432]
 ; CHECK-NEXT:    stp q0, q2, [x8, #464]
 ; CHECK-NEXT:    str q1, [x8, #496]
-; CHECK-NEXT:    add sp, sp, #80 // =80
+; CHECK-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 entry:
   br label %for.cond1.preheader
Index: llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
===================================================================
--- llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
+++ llvm/test/CodeGen/AArch64/shrink-wrapping-vla.ll
@@ -81,12 +81,14 @@
 ; CHECK-NEXT: mov x29, sp
 ; VLA allocation
-; CHECK: add [[X1:x[0-9]+]], [[X1]], #15
-; CHECK: mov [[X2:x[0-9]+]], sp
-; CHECK: and [[X1]], [[X1]], #0x7fffffff0
+; CHECK: mov [[W8:w[0-9]+]], w0
+; CHECK: lsl [[X1:x[0-9]+]], x8, #2
+; CHECK: add [[X2:x[0-9]+]], [[X1]], #15
+; CHECK: mov [[X3:x[0-9]+]], sp
+; CHECK: and [[X2]], [[X2]], #0x7fffffff0
 
 ; Saving the SP via llvm.stacksave()
 ; CHECK: mov [[SAVE:x[0-9]+]], sp
-; CHECK: sub [[X2]], [[X2]], [[X1]]
+; CHECK: sub [[X3]], [[X3]], [[X2]]
 
 ; The next instruction comes from llvm.stackrestore()
 ; CHECK: mov sp, [[SAVE]]
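
Note on what the CHECK churn amounts to: with AArch64TTIImpl::shouldFavorPostInc() returning true, LSR stops maintaining a shared byte-offset register and instead folds each pointer bump into the memory access via AArch64 post-indexed addressing, which is also why the arm64-scaled_iv.ll loop flips from comparing an offset against an end value to counting iterations down. The patch does not include the C source of mulDouble, so the sketch below is only an assumed reconstruction from the CHECK lines (three double* streams, 8-byte stride, 19 iterations); the before/after loop bodies in the comment are quoted from the test diff above.

```c
/*
 * Assumed C shape of the mulDouble kernel in arm64-scaled_iv.ll.
 *
 * Before (indexed, shared offset x8):   After (post-indexed):
 *   ldr  d0, [x1, x8]                     ldr  d0, [x1], #8
 *   ldr  d1, [x10, x8]                    ldr  d1, [x9], #8
 *   fmul d0, d0, d1                       subs w10, w10, #1
 *   str  d0, [x9, x8]                     fmul d0, d0, d1
 *   add  x8, x8, #8                       str  d0, [x8], #8
 *   cmp  w8, #152                         b.ne LBB0_1
 *   b.ne LBB0_1
 *
 * Each post-indexed load/store writes its incremented address back into
 * the base register, so x8 and the cmp disappear; 152 bytes / 8 bytes per
 * element = 19 iterations, which is where "mov w10, #19" comes from.
 */
void mulDouble(double *a, double *b, double *c) {
  for (int i = 1; i < 20; i++)
    a[i] = b[i - 1] * c[i + 1]; /* bases a+8, b+0, c+16, as in the asm */
}
```

The same writeback forms drive the frame-setup changes in ragreedy-local-interval-cost.ll: the pre-indexed `stp d15, d14, [sp, #-64]!` subsumes the old `sub sp, sp, #80`, and the post-indexed `ldp d15, d14, [sp], #64` subsumes the final `add sp, sp, #80`. The CFA offset drops from 80 to 64 because the 16-byte scratch slot for the spilled q0 accumulator is gone as well.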