Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2061,6 +2061,22 @@
     Info.WriteMem = true;
     Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
     break;
+  case Intrinsic::aarch64_sve_ld1:
+  case Intrinsic::aarch64_sve_ld2:
+  case Intrinsic::aarch64_sve_ld3:
+  case Intrinsic::aarch64_sve_ld4:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+    Info.PtrVal = Inst->getArgOperand(1);
+    break;
+  case Intrinsic::aarch64_sve_st1:
+  case Intrinsic::aarch64_sve_st2:
+  case Intrinsic::aarch64_sve_st3:
+  case Intrinsic::aarch64_sve_st4:
+    Info.ReadMem = false;
+    Info.WriteMem = true;
+    Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
+    break;
   }
 
   switch (Inst->getIntrinsicID()) {
@@ -2078,6 +2094,15 @@
   case Intrinsic::aarch64_neon_st4:
     Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
     break;
+  case Intrinsic::aarch64_sve_ld1:
+  case Intrinsic::aarch64_sve_st1:
+  case Intrinsic::aarch64_sve_ld2:
+  case Intrinsic::aarch64_sve_st2:
+  case Intrinsic::aarch64_sve_ld3:
+  case Intrinsic::aarch64_sve_st3:
+  case Intrinsic::aarch64_sve_ld4:
+  case Intrinsic::aarch64_sve_st4:
+    break;
   }
   return true;
 }
Index: llvm/test/Transforms/LoopStrengthReduce/AArch64/sve-load-store-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopStrengthReduce/AArch64/sve-load-store-intrinsics.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-reduce -S 2>&1 | FileCheck %s
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnu"
+
+define void @sve-load-store-intrinsics(i32* noalias nocapture %a, i32* %b, i32* %c, i64 %N) {
+; Check that after loop-reduce the SVE load and store intrinsics reuse the original induction variable.
+
+; CHECK: %lsr.iv = phi i64 [ %lsr.iv.next, %for.body ], [ 0, %for.body.lr.ph ]
+; CHECK: %scevgep4 = getelementptr i32, i32* %b, i64 %lsr.iv
+; CHECK: %7 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %6, i32* %scevgep4)
+; CHECK: %scevgep3 = getelementptr i32, i32* %c, i64 %lsr.iv
+; CHECK: %8 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %6, i32* %scevgep3)
+; CHECK: %9 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> %6, <vscale x 4 x i32> %7, <vscale x 4 x i32> %8)
+; CHECK: tail call void @llvm.aarch64.sve.st1.nxv4i32(<vscale x 4 x i32> %9, <vscale x 4 x i1> %6, i32* %lsr.iv1)
+
+entry:
+  %0 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 0, i64 %N)
+  %2 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %0, <vscale x 4 x i1> %1)
+  br i1 %2, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 2
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %5 = phi <vscale x 4 x i1> [ %1, %for.body.lr.ph ], [ %9, %for.body ]
+  %i.06 = phi i64 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.06
+  %6 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %5, i32* %arrayidx)
+  %arrayidx1 = getelementptr inbounds i32, i32* %c, i64 %i.06
+  %7 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %5, i32* %arrayidx1)
+  %8 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1> %5, <vscale x 4 x i32> %6, <vscale x 4 x i32> %7)
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.06
+  tail call void @llvm.aarch64.sve.st1.nxv4i32(<vscale x 4 x i32> %8, <vscale x 4 x i1> %5, i32* %arrayidx2)
+  %add = add i64 %4, %i.06
+  %9 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 %add, i64 %N)
+  %10 = tail call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %0, <vscale x 4 x i1> %9)
+  br i1 %10, label %for.body, label %for.cond.cleanup
+}
+
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64, i64)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1>, i32*)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.add.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare void @llvm.aarch64.sve.st1.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32* nocapture)
+declare i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)
+declare i64 @llvm.vscale.i64()