Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -718,7 +718,7 @@
 /// stride as collected by LoopVectorizationLegality::collectStridedAccess.
 const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                       const ValueToValueMap &PtrToStride,
-                                      Value *Ptr);
+                                      Type *AccessTy, Value *Ptr);
 
 /// If the pointer has a constant stride return it in units of the access type
 /// size. Otherwise return std::nullopt.
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -149,22 +149,34 @@
 
 const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                             const ValueToValueMap &PtrToStride,
-                                            Value *Ptr) {
+                                            Type *AccessTy, Value *Ptr) {
   const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
 
   // If there is an entry in the map return the SCEV of the pointer with the
-  // symbolic stride replaced by one.
+  // symbolic stride replaced by one unit.
   ValueToValueMap::const_iterator SI = PtrToStride.find(Ptr);
   if (SI == PtrToStride.end())
     // For a non-symbolic stride, just return the original expression.
     return OrigSCEV;
 
-  Value *StrideVal = stripIntegerCast(SI->second);
-
   ScalarEvolution *SE = PSE.getSE();
+  Value *StrideVal = stripIntegerCast(SI->second);
   const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
+
+  // The stride unit may not be one, e.g. for IR such as:
+  //   %addr = getelementptr i8, ptr %var, i64 %offset
+  //   %val = load float, ptr %addr
+  uint64_t UnitVal = 1;
+  if (auto *Gep = dyn_cast<GetElementPtrInst>(Ptr)) {
+    const DataLayout &DL = SE->getDataLayout();
+    uint64_t GepSize = DL.getTypeSizeInBits(Gep->getResultElementType());
+    uint64_t AccessSize = DL.getTypeSizeInBits(AccessTy);
+    if (GepSize < AccessSize)
+      UnitVal = AccessSize / GepSize;
+  }
   const auto *CT =
-      static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
+      static_cast<const SCEVConstant *>(SE->getConstant(ConstantInt::get(
+          cast<IntegerType>(StrideVal->getType()), UnitVal, true)));
 
   PSE.addPredicate(*SE->getEqualPredicate(U, CT));
   auto *Expr = PSE.getSCEV(Ptr);
@@ -951,7 +963,7 @@
 
 static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
 findForkedPointer(PredicatedScalarEvolution &PSE,
-                  const ValueToValueMap &StridesMap, Value *Ptr,
+                  const ValueToValueMap &StridesMap, Type *AccessTy, Value *Ptr,
                   const Loop *L) {
   ScalarEvolution *SE = PSE.getSE();
   assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
@@ -971,7 +983,7 @@
     return Scevs;
   }
 
-  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
+  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, AccessTy, Ptr), false}};
 }
 
 bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
@@ -984,7 +996,7 @@
   Value *Ptr = Access.getPointer();
 
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
-      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
+      findForkedPointer(PSE, StridesMap, AccessTy, Ptr, TheLoop);
 
   for (auto &P : TranslatedPtrs) {
     const SCEV *PtrExpr = get<0>(P);
@@ -1008,8 +1020,8 @@
     // If there's only one option for Ptr, look it up after bounds and wrap
     // checking, because assumptions might have been added to PSE.
     if (TranslatedPtrs.size() == 1)
-      TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
-                           false};
+      TranslatedPtrs[0] = {
+          replaceSymbolicStrideSCEV(PSE, StridesMap, AccessTy, Ptr), false};
   }
 
   for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
@@ -1380,7 +1392,8 @@
     return std::nullopt;
   }
 
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
+  const SCEV *PtrScev =
+      replaceSymbolicStrideSCEV(PSE, StridesMap, AccessTy, Ptr);
 
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
   if (Assume && !AR)
Index: llvm/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/lib/Analysis/VectorUtils.cpp
+++ llvm/lib/Analysis/VectorUtils.cpp
@@ -1176,7 +1176,8 @@
       getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, /*Assume=*/true,
                    /*ShouldCheckWrap=*/false).value_or(0);
 
-    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+    const SCEV *Scev =
+        replaceSymbolicStrideSCEV(PSE, Strides, ElementTy, Ptr);
     AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
                                             getLoadStoreAlignment(&I));
   }
Index: llvm/test/Transforms/LoopVectorize/stride-accesses-unit-check-fix.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/stride-accesses-unit-check-fix.ll
@@ -0,0 +1,102 @@
+;subroutine test(c, a, b, N)
+;  integer :: N
+;  real :: a(:), b(:)     ! assumed-shape arrays
+;  real, pointer :: c(:)  ! pointer array
+;
+;  do i = 1, N
+;    c(i) = a(i) + b(i)
+;  end do
+;end
+;
+;$ flang-new -fc1 -emit-llvm test.f90  ! generate LLVM IR
+;
+;The LLVM IR that Flang generates for arrays such as pointer arrays or
+;assumed-shape arrays may have a non-unit stride. For example, the stride
+;of the arrays `a`, `b`, and `c` above is 4 bytes. The IR accesses `a`,
+;`b`, and `c` through a descriptor like the following (see
+;flang/include/flang/ISO_Fortran_binding.h):
+;```
+;typedef struct CFI_cdesc_t {
+;  void *base_addr;
+;  size_t elem_len;
+;  int version;
+;  CFI_rank_t rank;
+;  CFI_type_t type;
+;  CFI_attribute_t attribute;
+;  unsigned char f18Addendum;
+;  CFI_dim_t dim[];
+;} CFI_cdesc_t;
+;```
+
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: define void @test_
+; CHECK: vector.scevcheck:
+; CHECK: %{{.*}} = icmp ne i64 %{{.*}}, 4
+; CHECK: %{{.*}} = icmp ne i64 %{{.*}}, 4
+; CHECK: %{{.*}} = icmp ne i64 %{{.*}}, 4
+
+; CHECK: vector.body:
+; CHECK: %[[OFFSET1:.*]] = mul i64 4, %{{.*}}
+; CHECK: %{{.*}} = getelementptr i8, ptr %{{.*}}, i64 %[[OFFSET1]]
+; CHECK: load <4 x float>
+; CHECK: %[[OFFSET2:.*]] = mul i64 4, %{{.*}}
+; CHECK: %{{.*}} = getelementptr i8, ptr %{{.*}}, i64 %[[OFFSET2]]
+; CHECK: load <4 x float>
+; CHECK: %[[OFFSET3:.*]] = mul i64 %{{.*}}, 4
+; CHECK: %{{.*}} = getelementptr i8, ptr %{{.*}}, i64 %[[OFFSET3]]
+; CHECK: store <4 x float>
+
+define void @test_(ptr %0, ptr %1, ptr %2, ptr %3) {
+  %5 = load i32, ptr %3, align 4, !tbaa !1
+  %6 = icmp sgt i32 %5, 0
+  br i1 %6, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %4
+  %7 = zext i32 %5 to i64
+  %8 = load ptr, ptr %1, align 8, !tbaa !5
+  %9 = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %1, i64 0, i32 7, i64 0, i64 2
+  %10 = load i64, ptr %9, align 8, !tbaa !5
+  %11 = load ptr, ptr %2, align 8, !tbaa !5
+  %12 = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %2, i64 0, i32 7, i64 0, i64 2
+  %13 = load i64, ptr %12, align 8, !tbaa !5
+  %.unpack = load ptr, ptr %0, align 8, !tbaa !5
+  %.elt15 = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i64 0, i32 7
+  %.unpack16.unpack.unpack = load i64, ptr %.elt15, align 8, !tbaa !5
+  %.unpack16.unpack.elt20 = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i64 0, i32 7, i64 0, i64 2
+  %.unpack16.unpack.unpack21 = load i64, ptr %.unpack16.unpack.elt20, align 8, !tbaa !5
+  br label %14
+
+14:                                               ; preds = %.lr.ph, %14
+  %indvars.iv = phi i64 [ 1, %.lr.ph ], [ %indvars.iv.next, %14 ]
+  %15 = add nsw i64 %indvars.iv, -1
+  %16 = mul i64 %10, %15
+  %17 = getelementptr i8, ptr %8, i64 %16
+  %18 = load float, ptr %17, align 4, !tbaa !1
+  %19 = mul i64 %13, %15
+  %20 = getelementptr i8, ptr %11, i64 %19
+  %21 = load float, ptr %20, align 4, !tbaa !1
+  %22 = fadd contract float %18, %21
+  %23 = sub i64 %indvars.iv, %.unpack16.unpack.unpack
+  %24 = mul i64 %23, %.unpack16.unpack.unpack21
+  %25 = getelementptr i8, ptr %.unpack, i64 %24
+  store float %22, ptr %25, align 4, !tbaa !1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv, %7
+  br i1 %exitcond.not, label %._crit_edge.loopexit, label %14
+
+._crit_edge.loopexit:                             ; preds = %14
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %4
+  ret void
+}
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"any data access", !3, i64 0}
+!3 = !{!"any access", !4, i64 0}
+!4 = !{!"Flang Type TBAA Root"}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"descriptor member", !3, i64 0}
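
Note (editor's sketch, not part of the patch): the unit value the new code feeds into the stride predicate is simply the ratio of the access size to the GEP source element size, both in bits. In the test above, `float` loads and stores (32 bits) go through `getelementptr i8` (8 bits), so the expected unit is 32 / 8 = 4, which is why `vector.scevcheck` compares each descriptor stride against 4 instead of 1. The helper name `strideUnit` below is hypothetical and the code has no LLVM dependencies; it only mirrors the arithmetic added to `replaceSymbolicStrideSCEV`.

```cpp
#include <cassert>
#include <cstdint>

// Standalone illustration (hypothetical helper, not LLVM code) of the UnitVal
// computation: when the GEP source element type is narrower than the accessed
// type, the symbolic stride must equal AccessSize / GepSize units rather than 1.
static uint64_t strideUnit(uint64_t GepSizeInBits, uint64_t AccessSizeInBits) {
  uint64_t UnitVal = 1;
  if (GepSizeInBits < AccessSizeInBits)
    UnitVal = AccessSizeInBits / GepSizeInBits;
  return UnitVal;
}

int main() {
  // `getelementptr i8` (8 bits) feeding `load float`/`store float` (32 bits),
  // as in the test above: the versioned stride must equal 4, matching the
  // `icmp ne i64 %{{.*}}, 4` lines checked in vector.scevcheck.
  assert(strideUnit(/*GepSizeInBits=*/8, /*AccessSizeInBits=*/32) == 4);
  // Unit-stride case: the GEP element type matches the access type.
  assert(strideUnit(/*GepSizeInBits=*/32, /*AccessSizeInBits=*/32) == 1);
  return 0;
}
```

With the old `SE->getOne(...)` predicate, the versioned stride was required to equal 1, a check the 4-byte descriptor strides in this test could never satisfy at runtime; versioning against the computed unit allows the runtime check to pass and the loop to be vectorized.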