diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11803,8 +11803,12 @@
     return false;
 
   // FIXME: Update this method to support scalable addressing modes.
-  if (isa<ScalableVectorType>(Ty))
-    return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+  if (isa<ScalableVectorType>(Ty)) {
+    uint64_t VecElemNumBytes =
+        DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
+    return AM.HasBaseReg && !AM.BaseOffs &&
+           (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
+  }
 
   // check reg + imm case:
   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -665,7 +665,7 @@
 
 /// Return an expression for LHS /s RHS, if it can be determined and if the
 /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
-/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that
+/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
 /// the multiplication may overflow, which is useful when the result will be
 /// used in a context where the most significant bits are ignored.
 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
@@ -733,6 +733,18 @@
   // Check for a multiply operand that we can pull RHS out of.
   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+      // Handle special case C1*X*Y /s C2*X*Y.
+      if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
+        const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
+        const SCEVConstant *RC = dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
+        if (LC && RC) {
+          SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
+          SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
+          if (LOps == ROps)
+            return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
+        }
+      }
+
       SmallVector<const SCEV *, 4> Ops;
       bool Found = false;
       for (const SCEV *S : Mul->operands()) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
--- a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -disable-lsr < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ; Check that vscale call is recognised by load/store reg/reg pattern and
-; partially folded, with the rest pulled out of the loop. This requires LSR to
-; be disabled, which is something that will be addressed at a later date.
+; partially folded, with the rest pulled out of the loop.
 define void @ld1w_reg_loop([32000 x i32]* %addr) {
 ; CHECK-LABEL: ld1w_reg_loop:
diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
@@ -0,0 +1,114 @@
+; REQUIRES: asserts
+
+; RUN: opt -S -loop-reduce < %s -debug-only=loop-reduce 2>&1 | FileCheck %s --check-prefixes=LSR-DEBUG,IR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=ASM
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+; LSR-DEBUG-LABEL: LSR on loop %loop
+; Check that 2 is extracted as interesting factor for "(16*vscale) / (8*vscale)".
+; LSR-DEBUG: LSR has identified the following interesting factors and types: *2
+
+; Check that scaling factor 2 is used to generate candidate formula for address accesses.
+; LSR-DEBUG: After generating reuse formulae:
+; LSR-DEBUG: reg(%out) + 2*reg({0,+,(8 * %vscale)}<%loop>)
+; LSR-DEBUG: reg(%in) + 2*reg({0,+,(8 * %vscale)}<%loop>)
+
+; LSR-DEBUG-LABEL: LSR on loop %loop_masked_ld_st
+; Check that 2 is extracted as interesting factor for "(16*vscale) / (8*vscale)".
+; LSR-DEBUG: LSR has identified the following interesting factors and types: *2
+
+; Check that scaling factor 2 is used to generate candidate formula for address accesses.
+; LSR-DEBUG: After generating reuse formulae:
+; LSR-DEBUG: reg(%out) + 2*reg({0,+,(8 * %vscale)}<%loop_masked_ld_st>)
+; LSR-DEBUG: reg(%in) + 2*reg({0,+,(8 * %vscale)}<%loop_masked_ld_st>)
+
+
+; IR-LABEL: ld_st_nxv8i16
+; Check that %in and %out are not cast from i16* to i8*.
+; IR-NOT: bitcast i16* {{.*}} to i8*
+
+; ASM-LABEL: ld_st_nxv8i16
+; Check that scaled-index addressing mode is leveraged.
+; ASM: ld1h { z{{[0-9]+}}.h }, p{{[0-9]+}}/z, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #1]
+; ASM: st1h { z{{[0-9]+}}.h }, p{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #1]
+
+define void @ld_st_nxv8i16(i16* %in, i16* %out) {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  %p_vec.splatinsert = insertelement <vscale x 8 x i16> undef, i16 3, i32 0
+  %p_vec.splat = shufflevector <vscale x 8 x i16> %p_vec.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %vscale = call i64 @llvm.vscale.i64()
+  %scaled_vf = shl i64 %vscale, 3
+  br label %loop
+
+loop:                                             ; preds = %loop, %loop.ph
+  %indvar = phi i64 [ 0, %loop.ph ], [ %indvar.next, %loop ]
+  %ptr.in = getelementptr inbounds i16, i16* %in, i64 %indvar
+  %ptr.out = getelementptr inbounds i16, i16* %out, i64 %indvar
+  %in.ptrcast = bitcast i16* %ptr.in to <vscale x 8 x i16>*
+  %out.ptrcast = bitcast i16* %ptr.out to <vscale x 8 x i16>*
+  %val = load <vscale x 8 x i16>, <vscale x 8 x i16>* %in.ptrcast, align 16
+  %addp_vec = add <vscale x 8 x i16> %val, %p_vec.splat
+  store <vscale x 8 x i16> %addp_vec, <vscale x 8 x i16>* %out.ptrcast, align 16
+  %indvar.next = add nsw i64 %indvar, %scaled_vf
+  %exit.cond = icmp eq i64 %indvar.next, 1024
+  br i1 %exit.cond, label %loop.exit, label %loop
+
+loop.exit:                                        ; preds = %loop
+  br label %exit
+
+exit:
+  ret void
+}
+
+; IR-LABEL: masked_ld_st_nxv8i16
+; Check that %in and %out are not cast from i16* to i8*.
+; IR-NOT: bitcast i16* {{.*}} to i8*
+
+; ASM-LABEL: masked_ld_st_nxv8i16
+; Check that scaled-index addressing mode is leveraged.
+; ASM: ld1h { z{{[0-9]+}}.h }, p{{[0-9]+}}/z, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #1]
+; ASM: st1h { z{{[0-9]+}}.h }, p{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #1]
+
+define void @masked_ld_st_nxv8i16(i16* %in, i16* %out, i64 %n) {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  %p_vec.splatinsert = insertelement <vscale x 8 x i16> undef, i16 3, i32 0
+  %p_vec.splat = shufflevector <vscale x 8 x i16> %p_vec.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %ptrue_vec.splatinsert = insertelement <vscale x 8 x i1> undef, i1 true, i32 0
+  %ptrue_vec.splat = shufflevector <vscale x 8 x i1> %ptrue_vec.splatinsert, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+  %vscale = call i64 @llvm.vscale.i64()
+  %scaled_vf = shl i64 %vscale, 3
+  br label %loop_masked_ld_st
+
+loop_masked_ld_st:                                ; preds = %loop_masked_ld_st, %loop.ph
+  %indvar = phi i64 [ 0, %loop.ph ], [ %indvar.next, %loop_masked_ld_st ]
+  %ptr.in = getelementptr inbounds i16, i16* %in, i64 %indvar
+  %ptr.out = getelementptr inbounds i16, i16* %out, i64 %indvar
+  %in.ptrcast = bitcast i16* %ptr.in to <vscale x 8 x i16>*
+  %out.ptrcast = bitcast i16* %ptr.out to <vscale x 8 x i16>*
+  %val = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* %in.ptrcast, i32 4, <vscale x 8 x i1> %ptrue_vec.splat, <vscale x 8 x i16> undef)
+  %addp_vec = add <vscale x 8 x i16> %val, %p_vec.splat
+  call void @llvm.masked.store.nxv8i16.p0nxv8i16(<vscale x 8 x i16> %addp_vec, <vscale x 8 x i16>* %out.ptrcast, i32 4, <vscale x 8 x i1> %ptrue_vec.splat)
+  %indvar.next = add nsw i64 %indvar, %scaled_vf
+  %exit.cond = icmp eq i64 %indvar.next, %n
+  br i1 %exit.cond, label %loop.exit, label %loop_masked_ld_st
+
+loop.exit:                                        ; preds = %loop_masked_ld_st
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>*, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>)
+
+declare void @llvm.masked.store.nxv8i16.p0nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32 immarg, <vscale x 8 x i1>)
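
Illustrative sketch (not part of the patch): the isLegalAddressingMode change above accepts reg+reg addressing for a scalable vector type only when the index is unscaled or scaled by the element size in bytes, which is exactly what SVE's scaled-index loads/stores encode (e.g. "lsl #1" for 16-bit elements). The standalone C++ below restates that rule; the AddrMode struct and function name here are simplified stand-ins carrying only the fields the patch reads (HasBaseReg, BaseOffs, Scale), not LLVM API.

#include <cstdint>
#include <iostream>

// Simplified stand-in for the addressing-mode description the patch checks.
struct AddrMode {
  bool HasBaseReg;
  int64_t BaseOffs;
  int64_t Scale;
};

// Legal for a scalable vector type iff there is a base register, no
// immediate offset, and the scale is 0 or the element size in bytes.
bool isLegalScalableAddrMode(const AddrMode &AM, uint64_t VecElemNumBytes) {
  return AM.HasBaseReg && !AM.BaseOffs &&
         (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
}

int main() {
  // <vscale x 8 x i16>: 2-byte elements, so scale 2 corresponds to
  // "ld1h { z0.h }, p0/z, [x0, x1, lsl #1]" and is accepted.
  std::cout << isLegalScalableAddrMode({true, 0, 2}, 2) << '\n'; // prints 1
  // Scale 4 has no matching SVE 16-bit load/store form and is rejected.
  std::cout << isLegalScalableAddrMode({true, 0, 4}, 2) << '\n'; // prints 0
}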