diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11803,8 +11803,12 @@
     return false;
 
   // FIXME: Update this method to support scalable addressing modes.
-  if (isa<ScalableVectorType>(Ty))
-    return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+  if (isa<ScalableVectorType>(Ty)) {
+    uint64_t VecElemNumBytes =
+        DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
+    return AM.HasBaseReg && !AM.BaseOffs &&
+           (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
+  }
 
   // check reg + imm case:
   // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -665,7 +665,7 @@
 /// Return an expression for LHS /s RHS, if it can be determined and if the
 /// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
-/// is true, expressions like (X * Y) /s Y are simplified to Y, ignoring that
+/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
 /// the multiplication may overflow, which is useful when the result will be
 /// used in a context where the most significant bits are ignored.
 static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
@@ -733,6 +733,21 @@
   // Check for a multiply operand that we can pull RHS out of.
   if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
     if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+      // Handle special case C1*X*Y /s C2*X*Y.
+      if (const SCEVMulExpr *MulRHS = dyn_cast<SCEVMulExpr>(RHS)) {
+        if (IgnoreSignificantBits || isMulSExtable(MulRHS, SE)) {
+          const SCEVConstant *LC = dyn_cast<SCEVConstant>(Mul->getOperand(0));
+          const SCEVConstant *RC =
+              dyn_cast<SCEVConstant>(MulRHS->getOperand(0));
+          if (LC && RC) {
+            SmallVector<const SCEV *, 4> LOps(drop_begin(Mul->operands()));
+            SmallVector<const SCEV *, 4> ROps(drop_begin(MulRHS->operands()));
+            if (LOps == ROps)
+              return getExactSDiv(LC, RC, SE, IgnoreSignificantBits);
+          }
+        }
+      }
+
       SmallVector<const SCEV *, 4> Ops;
       bool Found = false;
       for (const SCEV *S : Mul->operands()) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
--- a/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fold-vscale.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -disable-lsr < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 
 ; Check that vscale call is recognised by load/store reg/reg pattern and
-; partially folded, with the rest pulled out of the loop. This requires LSR to
-; be disabled, which is something that will be addressed at a later date.
+; partially folded, with the rest pulled out of the loop.
 
 define void @ld1w_reg_loop([32000 x i32]* %addr) {
 ; CHECK-LABEL: ld1w_reg_loop:
diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll
@@ -0,0 +1,165 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s --check-prefix=IR
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=ASM
+; Note: To update this test, please run utils/update_test_checks.py and utils/update_llc_test_checks.py separately on the opt and llc RUN lines.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linux-gnu"
+
+; These tests check that the IR coming out of LSR does not cast the input/output pointers from i16* to i8*,
+; and that the scaled-index addressing mode is used in the generated assembly, i.e. ld1h { z1.h }, p0/z, [x0, x8, lsl #1].
+
+define void @ld_st_nxv8i16(i16* %in, i16* %out) {
+; IR-LABEL: @ld_st_nxv8i16(
+; IR-NEXT:  entry:
+; IR-NEXT:    br label [[LOOP_PH:%.*]]
+; IR:       loop.ph:
+; IR-NEXT:    [[P_VEC_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> undef, i16 3, i32 0
+; IR-NEXT:    [[P_VEC_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[P_VEC_SPLATINSERT]], <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+; IR-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; IR-NEXT:    [[SCALED_VF:%.*]] = shl i64 [[VSCALE]], 3
+; IR-NEXT:    br label [[LOOP:%.*]]
+; IR:       loop:
+; IR-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[INDVAR_NEXT:%.*]], [[LOOP]] ]
+; IR-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i16, i16* [[IN:%.*]], i64 [[INDVAR]]
+; IR-NEXT:    [[SCEVGEP23:%.*]] = bitcast i16* [[SCEVGEP2]] to <vscale x 8 x i16>*
+; IR-NEXT:    [[SCEVGEP:%.*]] = getelementptr i16, i16* [[OUT:%.*]], i64 [[INDVAR]]
+; IR-NEXT:    [[SCEVGEP1:%.*]] = bitcast i16* [[SCEVGEP]] to <vscale x 8 x i16>*
+; IR-NEXT:    [[VAL:%.*]] = load <vscale x 8 x i16>, <vscale x 8 x i16>* [[SCEVGEP23]], align 16
+; IR-NEXT:    [[ADDP_VEC:%.*]] = add <vscale x 8 x i16> [[VAL]], [[P_VEC_SPLAT]]
+; IR-NEXT:    store <vscale x 8 x i16> [[ADDP_VEC]], <vscale x 8 x i16>* [[SCEVGEP1]], align 16
+; IR-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALED_VF]]
+; IR-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], 1024
+; IR-NEXT:    br i1 [[EXIT_COND]], label [[LOOP_EXIT:%.*]], label [[LOOP]]
+; IR:       loop.exit:
+; IR-NEXT:    br label [[EXIT:%.*]]
+; IR:       exit:
+; IR-NEXT:    ret void
+;
+; ASM-LABEL: ld_st_nxv8i16:
+; ASM:       // %bb.0: // %entry
+; ASM-NEXT:    mov x8, xzr
+; ASM-NEXT:    mov z0.h, #3 // =0x3
+; ASM-NEXT:    cnth x9
+; ASM-NEXT:    ptrue p0.h
+; ASM-NEXT:  .LBB0_1: // %loop
+; ASM-NEXT:    // =>This Inner Loop Header: Depth=1
+; ASM-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; ASM-NEXT:    add z1.h, z1.h, z0.h
+; ASM-NEXT:    st1h { z1.h }, p0, [x1, x8, lsl #1]
+; ASM-NEXT:    add x8, x8, x9
+; ASM-NEXT:    cmp x8, #1024 // =1024
+; ASM-NEXT:    b.ne .LBB0_1
+; ASM-NEXT:  // %bb.2: // %exit
+; ASM-NEXT:    ret
+entry:
+  br label %loop.ph
+
+loop.ph:
+  %p_vec.splatinsert = insertelement <vscale x 8 x i16> undef, i16 3, i32 0
+  %p_vec.splat = shufflevector <vscale x 8 x i16> %p_vec.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %vscale = call i64 @llvm.vscale.i64()
+  %scaled_vf = shl i64 %vscale, 3
+  br label %loop
+
+loop:                                             ; preds = %loop, %loop.ph
+  %indvar = phi i64 [ 0, %loop.ph ], [ %indvar.next, %loop ]
+  %ptr.in = getelementptr inbounds i16, i16* %in, i64 %indvar
+  %ptr.out = getelementptr inbounds i16, i16* %out, i64 %indvar
+  %in.ptrcast = bitcast i16* %ptr.in to <vscale x 8 x i16>*
+  %out.ptrcast = bitcast i16* %ptr.out to <vscale x 8 x i16>*
+  %val = load <vscale x 8 x i16>, <vscale x 8 x i16>* %in.ptrcast, align 16
+  %addp_vec = add <vscale x 8 x i16> %val, %p_vec.splat
+  store <vscale x 8 x i16> %addp_vec, <vscale x 8 x i16>* %out.ptrcast, align 16
+  %indvar.next = add nsw i64 %indvar, %scaled_vf
+  %exit.cond = icmp eq i64 %indvar.next, 1024
+  br i1 %exit.cond, label %loop.exit, label %loop
+
+loop.exit:                                        ; preds = %loop
+  br label %exit
+
+exit:
+  ret void
+}
+
+define void @masked_ld_st_nxv8i16(i16* %in, i16* %out, i64 %n) {
+; IR-LABEL: @masked_ld_st_nxv8i16(
+; IR-NEXT:  entry:
+; IR-NEXT:    br label [[LOOP_PH:%.*]]
+; IR:       loop.ph:
+; IR-NEXT:    [[P_VEC_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> undef, i16 3, i32 0
+; IR-NEXT:    [[P_VEC_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[P_VEC_SPLATINSERT]], <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+; IR-NEXT:    [[PTRUE_VEC_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i1> undef, i1 true, i32 0
+; IR-NEXT:    [[PTRUE_VEC_SPLAT:%.*]] = shufflevector <vscale x 8 x i1> [[PTRUE_VEC_SPLATINSERT]], <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+; IR-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; IR-NEXT:    [[SCALED_VF:%.*]] = shl i64 [[VSCALE]], 3
+; IR-NEXT:    br label [[LOOP:%.*]]
+; IR:       loop:
+; IR-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[INDVAR_NEXT:%.*]], [[LOOP]] ]
+; IR-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i16, i16* [[IN:%.*]], i64 [[INDVAR]]
+; IR-NEXT:    [[SCEVGEP23:%.*]] = bitcast i16* [[SCEVGEP2]] to <vscale x 8 x i16>*
+; IR-NEXT:    [[SCEVGEP:%.*]] = getelementptr i16, i16* [[OUT:%.*]], i64 [[INDVAR]]
+; IR-NEXT:    [[SCEVGEP1:%.*]] = bitcast i16* [[SCEVGEP]] to <vscale x 8 x i16>*
+; IR-NEXT:    [[VAL:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* [[SCEVGEP23]], i32 4, <vscale x 8 x i1> [[PTRUE_VEC_SPLAT]], <vscale x 8 x i16> undef)
+; IR-NEXT:    [[ADDP_VEC:%.*]] = add <vscale x 8 x i16> [[VAL]], [[P_VEC_SPLAT]]
+; IR-NEXT:    call void @llvm.masked.store.nxv8i16.p0nxv8i16(<vscale x 8 x i16> [[ADDP_VEC]], <vscale x 8 x i16>* [[SCEVGEP1]], i32 4, <vscale x 8 x i1> [[PTRUE_VEC_SPLAT]])
+; IR-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], [[SCALED_VF]]
+; IR-NEXT:    [[EXIT_COND:%.*]] = icmp eq i64 [[N:%.*]], [[INDVAR_NEXT]]
+; IR-NEXT:    br i1 [[EXIT_COND]], label [[LOOP_EXIT:%.*]], label [[LOOP]]
+; IR:       loop.exit:
+; IR-NEXT:    br label [[EXIT:%.*]]
+; IR:       exit:
+; IR-NEXT:    ret void
+;
+; ASM-LABEL: masked_ld_st_nxv8i16:
+; ASM:       // %bb.0: // %entry
+; ASM-NEXT:    mov x8, xzr
+; ASM-NEXT:    mov z0.h, #3 // =0x3
+; ASM-NEXT:    ptrue p0.h
+; ASM-NEXT:    cnth x9
+; ASM-NEXT:  .LBB1_1: // %loop
+; ASM-NEXT:    // =>This Inner Loop Header: Depth=1
+; ASM-NEXT:    ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
+; ASM-NEXT:    add z1.h, z1.h, z0.h
+; ASM-NEXT:    st1h { z1.h }, p0, [x1, x8, lsl #1]
+; ASM-NEXT:    add x8, x8, x9
+; ASM-NEXT:    cmp x2, x8
+; ASM-NEXT:    b.ne .LBB1_1
+; ASM-NEXT:  // %bb.2: // %exit
+; ASM-NEXT:    ret
+entry:
+  br label %loop.ph
+
+loop.ph:
+  %p_vec.splatinsert = insertelement <vscale x 8 x i16> undef, i16 3, i32 0
+  %p_vec.splat = shufflevector <vscale x 8 x i16> %p_vec.splatinsert, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  %ptrue_vec.splatinsert = insertelement <vscale x 8 x i1> undef, i1 true, i32 0
+  %ptrue_vec.splat = shufflevector <vscale x 8 x i1> %ptrue_vec.splatinsert, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
+  %vscale = call i64 @llvm.vscale.i64()
+  %scaled_vf = shl i64 %vscale, 3
+  br label %loop
+
+loop:                                             ; preds = %loop, %loop.ph
+  %indvar = phi i64 [ 0, %loop.ph ], [ %indvar.next, %loop ]
+  %ptr.in = getelementptr inbounds i16, i16* %in, i64 %indvar
+  %ptr.out = getelementptr inbounds i16, i16* %out, i64 %indvar
+  %in.ptrcast = bitcast i16* %ptr.in to <vscale x 8 x i16>*
+  %out.ptrcast = bitcast i16* %ptr.out to <vscale x 8 x i16>*
+  %val = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>* %in.ptrcast, i32 4, <vscale x 8 x i1> %ptrue_vec.splat, <vscale x 8 x i16> undef)
+  %addp_vec = add <vscale x 8 x i16> %val, %p_vec.splat
+  call void @llvm.masked.store.nxv8i16.p0nxv8i16(<vscale x 8 x i16> %addp_vec, <vscale x 8 x i16>* %out.ptrcast, i32 4, <vscale x 8 x i1> %ptrue_vec.splat)
+  %indvar.next = add nsw i64 %indvar, %scaled_vf
+  %exit.cond = icmp eq i64 %indvar.next, %n
+  br i1 %exit.cond, label %loop.exit, label %loop
+
+loop.exit:                                        ; preds = %loop
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0nxv8i16(<vscale x 8 x i16>*, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x i16>)
+
+declare void @llvm.masked.store.nxv8i16.p0nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>*, i32 immarg, <vscale x 8 x i1>)
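
For review convenience, here is a minimal standalone sketch of the rule the AArch64 isLegalAddressingMode change above applies to scalable vector types: a reg+reg form is legal only when the scale is absent or equal to the element size in bytes, which is what the hardware scaled-index form (e.g. [x0, x8, lsl #1] for 16-bit elements) can encode. This is plain C++, not the LLVM TargetLowering API; the AddrMode struct and isLegalScalableAM name are invented for illustration, and the inline-size of 4 used for the SmallVectors above is likewise an assumption about the surrounding code.

// Illustrative sketch only; mirrors the fields of the addressing-mode check
// in the patch, not LLVM's actual data structures.
#include <cstdint>

struct AddrMode {
  bool HasBaseReg;   // base register present
  int64_t BaseOffs;  // constant offset
  int64_t Scale;     // scale applied to the index register (0 = no index)
};

// Returns true when the mode is representable for a scalable vector whose
// elements are ElemSizeInBits wide: base register required, no immediate
// offset, and the scale must be 0 or exactly the element size in bytes.
bool isLegalScalableAM(const AddrMode &AM, uint64_t ElemSizeInBits) {
  uint64_t VecElemNumBytes = ElemSizeInBits / 8;
  return AM.HasBaseReg && AM.BaseOffs == 0 &&
         (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
}

int main() {
  // reg + reg*2 for 16-bit elements (nxv8i16): legal, maps to "lsl #1".
  bool scaled_by_elem = isLegalScalableAM({true, 0, 2}, 16);   // true
  // reg + reg*4 for 16-bit elements: not encodable as a scaled index.
  bool scaled_by_four = isLegalScalableAM({true, 0, 4}, 16);   // false
  return (scaled_by_elem && !scaled_by_four) ? 0 : 1;
}

This is why LSR is now allowed to keep the vscale-based induction variable in units of elements (the x8 register in the tests) instead of rewriting the geps through an i8* cast: the target reports that a scale equal to the element size is free.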