Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -718,7 +718,7 @@
 /// stride as collected by LoopVectorizationLegality::collectStridedAccess.
 const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                       const ValueToValueMap &PtrToStride,
-                                      Value *Ptr);
+                                      Type *AccessTy, Value *Ptr);
 
 /// If the pointer has a constant stride return it in units of the access type
 /// size. Otherwise return std::nullopt.
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -150,11 +150,11 @@
 
 const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
                                             const ValueToValueMap &PtrToStride,
-                                            Value *Ptr) {
+                                            Type *AccessTy, Value *Ptr) {
   const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
 
   // If there is an entry in the map return the SCEV of the pointer with the
-  // symbolic stride replaced by one.
+  // symbolic stride replaced by one unit of the access type.
   ValueToValueMap::const_iterator SI = PtrToStride.find(Ptr);
   if (SI == PtrToStride.end())
     // For a non-symbolic stride, just return the original expression.
@@ -166,7 +166,22 @@
   const SCEV *StrideSCEV = SE->getSCEV(StrideVal);
   assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");
 
-  const auto *CT = SE->getOne(StrideSCEV->getType());
+  // The symbolic stride may be expressed in units smaller than the access
+  // type, e.g. a float access addressed through a byte-typed GEP:
+  //   %addr = getelementptr i8, ptr %var, i64 %offset
+  //   %val = load float, ptr %addr
+  auto getStrideSize = [=]() -> uint64_t {
+    if (const auto *Gep = dyn_cast<GetElementPtrInst>(Ptr)) {
+      const DataLayout &DL = SE->getDataLayout();
+      uint64_t GepSize = DL.getTypeSizeInBits(Gep->getResultElementType());
+      uint64_t AccessSize = DL.getTypeSizeInBits(AccessTy);
+      if (GepSize < AccessSize)
+        return AccessSize / GepSize;
+    }
+    return 1;
+  };
+  const auto *CT = SE->getConstant(StrideVal->getType(), getStrideSize());
+
   PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
 
   auto *Expr = PSE.getSCEV(Ptr);
@@ -952,7 +967,7 @@
 
 static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
 findForkedPointer(PredicatedScalarEvolution &PSE,
-                  const ValueToValueMap &StridesMap, Value *Ptr,
+                  const ValueToValueMap &StridesMap, Type *AccessTy, Value *Ptr,
                   const Loop *L) {
   ScalarEvolution *SE = PSE.getSE();
   assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
@@ -972,7 +987,7 @@
     return Scevs;
   }
 
-  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
+  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, AccessTy, Ptr), false}};
 }
 
 bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
@@ -985,7 +1000,7 @@
   Value *Ptr = Access.getPointer();
 
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
-      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
+      findForkedPointer(PSE, StridesMap, AccessTy, Ptr, TheLoop);
 
   for (auto &P : TranslatedPtrs) {
     const SCEV *PtrExpr = get<0>(P);
@@ -1009,8 +1024,8 @@
   // If there's only one option for Ptr, look it up after bounds and wrap
   // checking, because assumptions might have been added to PSE.
   if (TranslatedPtrs.size() == 1)
-    TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
-                         false};
+    TranslatedPtrs[0] = {
+        replaceSymbolicStrideSCEV(PSE, StridesMap, AccessTy, Ptr), false};
   }
 
   for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
@@ -1381,7 +1396,8 @@
     return std::nullopt;
   }
 
-  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
+  const SCEV *PtrScev =
+      replaceSymbolicStrideSCEV(PSE, StridesMap, AccessTy, Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
 
   if (Assume && !AR)
Index: llvm/lib/Analysis/VectorUtils.cpp
===================================================================
--- llvm/lib/Analysis/VectorUtils.cpp
+++ llvm/lib/Analysis/VectorUtils.cpp
@@ -1042,7 +1042,8 @@
         getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
                      /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);
 
-    const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
+    const SCEV *Scev =
+        replaceSymbolicStrideSCEV(PSE, Strides, ElementTy, Ptr);
     AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
                                             getLoadStoreAlignment(&I));
   }
Index: llvm/test/Transforms/LoopVectorize/stride-accesses-unit-check-fix.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/stride-accesses-unit-check-fix.ll
@@ -0,0 +1,319 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+;subroutine test(c, a, b, N)
+;  integer :: N
+;  real :: a(:), b(:)    ! assumed-shape array
+;  real, pointer :: c(:) ! pointer array
+;
+;  do i = 1, N
+;    c(i) = a(i) + b(i)
+;  end do
+;end
+;
+;$ flang-new -fc1 -emit-llvm test.f90  ! generate LLVM IR
+;
+;Fortran IR for arrays such as pointer arrays or assumed-shape arrays may
+;have a non-unit stride. For example, the stride of the arrays `a`, `b`,
+;`c` above is 4 bytes.
Fortran IR for the arrays `a`, `b`, `c` uses descriptor like +;the following (See flang/include/flang/ISO_Fortran_binding.h): +;``` +;typedef struct CFI_cdesc_t { +; void *base_addr; +; size_t elem_len; +; int version; +; CFI_rank_t rank; +; CFI_type_t type; +; CFI_attribute_t attribute; +; unsigned char f18Addendum; +; CFI_dim_t dim[]; +;} CFI_cdesc_t; +;``` + +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +define void @test_(ptr %0, ptr %1, ptr %2, ptr %N) { +; CHECK-LABEL: @test_( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_EXIT:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[A_ADDR:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8 +; CHECK-NEXT: [[A_ADDR4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64 +; CHECK-NEXT: [[A_STRIDE_ADDR:%.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP1]], i64 0, i32 7, i64 0, i64 2 +; CHECK-NEXT: [[A_STRIDE:%.*]] = load i64, ptr [[A_STRIDE_ADDR]], align 8 +; CHECK-NEXT: [[B_ADDR:%.*]] = load ptr, ptr [[TMP2:%.*]], align 8 +; CHECK-NEXT: [[B_ADDR5:%.*]] = ptrtoint ptr [[B_ADDR]] to i64 +; CHECK-NEXT: [[B_STRIDE_ADDR:%.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP2]], i64 0, i32 7, i64 0, i64 2 +; CHECK-NEXT: [[B_STRIDE:%.*]] = load i64, ptr [[B_STRIDE_ADDR]], align 8 +; CHECK-NEXT: [[C_ADDR:%.*]] = load ptr, ptr [[TMP0:%.*]], align 8 +; CHECK-NEXT: [[C_ADDR3:%.*]] = ptrtoint ptr [[C_ADDR]] to i64 +; CHECK-NEXT: [[C_LOWERBOUND_ADDR:%.*]] = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP0]], i64 0, i32 7 +; CHECK-NEXT: [[C_LOWERBOUND:%.*]] = load i64, ptr [[C_LOWERBOUND_ADDR]], align 8 +; CHECK-NEXT: [[C_STRIDE_ADDR:%.*]] = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP0]], i64 0, i32 7, i64 0, i64 2 +; CHECK-NEXT: [[C_STRIDE:%.*]] = load i64, ptr [[C_STRIDE_ADDR]], align 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[C_STRIDE]], 4 +; CHECK-NEXT: [[IDENT_CHECK1:%.*]] = icmp ne i64 [[A_STRIDE]], 4 +; CHECK-NEXT: [[IDENT_CHECK2:%.*]] = icmp ne i64 [[B_STRIDE]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK1]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK2]] +; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[C_ADDR3]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[C_LOWERBOUND]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP10]], [[A_ADDR4]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP11]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 [[TMP10]], [[B_ADDR5]] +; CHECK-NEXT: [[DIFF_CHECK6:%.*]] = icmp ult i64 [[TMP12]], 16 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK6]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4 +; CHECK-NEXT: 
[[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 1, [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = add nsw i64 [[TMP13]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[A_STRIDE]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A_ADDR]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr float, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[B_STRIDE]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[B_ADDR]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[TMP19]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP20]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = fadd contract <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[TMP13]], [[C_LOWERBOUND]] +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], [[C_STRIDE]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[C_ADDR]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr float, ptr [[TMP24]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[TMP25]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[LOOP_PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: loop.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-NEXT: [[TMP27:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[A_STRIDE]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[A_ADDR]], i64 [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = load float, ptr [[TMP29]], align 4 +; CHECK-NEXT: [[TMP31:%.*]] = mul i64 [[B_STRIDE]], [[TMP27]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[B_ADDR]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4 +; CHECK-NEXT: [[TMP34:%.*]] = fadd contract float [[TMP30]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = sub i64 [[INDVARS_IV]], [[C_LOWERBOUND]] +; CHECK-NEXT: [[TMP36:%.*]] = mul i64 [[TMP35]], [[C_STRIDE]] +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[C_ADDR]], i64 [[TMP36]] +; CHECK-NEXT: store float [[TMP34]], ptr [[TMP37]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP5]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LOOPEXIT]], label [[LOOP_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: loop.exit.loopexit: +; CHECK-NEXT: br label [[LOOP_EXIT]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; +entry: + %3 = load i32, ptr %N, align 4 + %4 = icmp sgt i32 %3, 0 + br i1 %4, label %loop.preheader, 
label %loop.exit + +loop.preheader: ; preds = %entry + %5 = zext i32 %3 to i64 + %a_addr = load ptr, ptr %1, align 8 + %a_stride_addr = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %1, i64 0, i32 7, i64 0, i64 2 + %a_stride = load i64, ptr %a_stride_addr, align 8 + %b_addr = load ptr, ptr %2, align 8 + %b_stride_addr = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %2, i64 0, i32 7, i64 0, i64 2 + %b_stride = load i64, ptr %b_stride_addr, align 8 + %c_addr = load ptr, ptr %0, align 8 + %c_lowerbound_addr = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i64 0, i32 7 + %c_lowerbound = load i64, ptr %c_lowerbound_addr, align 8 + %c_stride_addr = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i64 0, i32 7, i64 0, i64 2 + %c_stride = load i64, ptr %c_stride_addr, align 8 + br label %loop.body + +loop.body: ; preds = %loop.preheader, %loop.body + %indvars.iv = phi i64 [ 1, %loop.preheader ], [ %indvars.iv.next, %loop.body ] + %6 = add nsw i64 %indvars.iv, -1 + %7 = mul i64 %a_stride, %6 + %8 = getelementptr i8, ptr %a_addr, i64 %7 + %9 = load float, ptr %8, align 4 + %10 = mul i64 %b_stride, %6 + %11 = getelementptr i8, ptr %b_addr, i64 %10 + %12 = load float, ptr %11, align 4 + %13 = fadd contract float %9, %12 + %14 = sub i64 %indvars.iv, %c_lowerbound + %15 = mul i64 %14, %c_stride + %16 = getelementptr i8, ptr %c_addr, i64 %15 + store float %13, ptr %16, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv, %5 + br i1 %exitcond.not, label %loop.exit, label %loop.body + +loop.exit: ; preds = %entry, %loop.body + ret void +} + +;subroutine test2(c, a, b, N) +; integer :: N +; integer(1) :: a(:), b(:) +; integer(1), pointer :: c(:) +; +; do i = 1, N +; c(i) = a(i) + b(i) +; end do +;end + +define void @test2_(ptr %0, ptr %1, ptr %2, ptr %N) { +; CHECK-LABEL: @test2_( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[LOOP_PREHEADER:%.*]], label [[LOOP_EXIT:%.*]] +; CHECK: loop.preheader: +; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[A_ADDR:%.*]] = load ptr, ptr [[TMP1:%.*]], align 8 +; CHECK-NEXT: [[A_ADDR4:%.*]] = ptrtoint ptr [[A_ADDR]] to i64 +; CHECK-NEXT: [[A_STRIDE_ADDR:%.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP1]], i64 0, i32 7, i64 0, i64 2 +; CHECK-NEXT: [[A_STRIDE:%.*]] = load i64, ptr [[A_STRIDE_ADDR]], align 8 +; CHECK-NEXT: [[B_ADDR:%.*]] = load ptr, ptr [[TMP2:%.*]], align 8 +; CHECK-NEXT: [[B_ADDR5:%.*]] = ptrtoint ptr [[B_ADDR]] to i64 +; CHECK-NEXT: [[B_STRIDE_ADDR:%.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP2]], i64 0, i32 7, i64 0, i64 2 +; CHECK-NEXT: [[B_STRIDE:%.*]] = load i64, ptr [[B_STRIDE_ADDR]], align 8 +; CHECK-NEXT: [[C_ADDR:%.*]] = load ptr, ptr [[TMP0:%.*]], align 8 +; CHECK-NEXT: [[C_ADDR3:%.*]] = ptrtoint ptr [[C_ADDR]] to i64 +; CHECK-NEXT: [[C_LOWERBOUND_ADDR:%.*]] = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP0]], i64 0, i32 7 +; CHECK-NEXT: [[C_LOWERBOUND:%.*]] = load i64, ptr [[C_LOWERBOUND_ADDR]], align 8 +; CHECK-NEXT: [[C_STRIDE_ADDR:%.*]] = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr [[TMP0]], i64 0, i32 7, i64 0, i64 2 +; CHECK-NEXT: [[C_STRIDE:%.*]] = load i64, ptr 
[[C_STRIDE_ADDR]], align 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i64 [[C_STRIDE]], 1 +; CHECK-NEXT: [[IDENT_CHECK1:%.*]] = icmp ne i64 [[A_STRIDE]], 1 +; CHECK-NEXT: [[IDENT_CHECK2:%.*]] = icmp ne i64 [[B_STRIDE]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK1]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[IDENT_CHECK2]] +; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[C_ADDR3]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = sub i64 [[TMP8]], [[C_LOWERBOUND]] +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[A_ADDR4]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP10]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP9]], [[B_ADDR5]] +; CHECK-NEXT: [[DIFF_CHECK6:%.*]] = icmp ult i64 [[TMP11]], 4 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK6]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 1, [[N_VEC]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[A_STRIDE]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[A_ADDR]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[B_STRIDE]], [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[B_ADDR]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP19]], align 1 +; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[TMP12]], [[C_LOWERBOUND]] +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[C_STRIDE]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[C_ADDR]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0 +; CHECK-NEXT: store <4 x i8> [[TMP20]], ptr [[TMP24]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[LOOP_PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ], [ 1, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[LOOP_BODY:%.*]] +; CHECK: loop.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP_BODY]] ] +; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; 
CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[A_STRIDE]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[A_ADDR]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[B_STRIDE]], [[TMP26]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[B_ADDR]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = load i8, ptr [[TMP31]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = add i8 [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = sub i64 [[INDVARS_IV]], [[C_LOWERBOUND]] +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], [[C_STRIDE]] +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[C_ADDR]], i64 [[TMP35]] +; CHECK-NEXT: store i8 [[TMP33]], ptr [[TMP36]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP5]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[LOOP_EXIT_LOOPEXIT]], label [[LOOP_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: loop.exit.loopexit: +; CHECK-NEXT: br label [[LOOP_EXIT]] +; CHECK: loop.exit: +; CHECK-NEXT: ret void +; +entry: + %3 = load i32, ptr %N, align 4 + %4 = icmp sgt i32 %3, 0 + br i1 %4, label %loop.preheader, label %loop.exit + +loop.preheader: ; preds = %entry + %5 = zext i32 %3 to i64 + %a_addr = load ptr, ptr %1, align 8 + %a_stride_addr = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %1, i64 0, i32 7, i64 0, i64 2 + %a_stride = load i64, ptr %a_stride_addr, align 8 + %b_addr = load ptr, ptr %2, align 8 + %b_stride_addr = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %2, i64 0, i32 7, i64 0, i64 2 + %b_stride = load i64, ptr %b_stride_addr, align 8 + %c_addr = load ptr, ptr %0, align 8 + %c_lowerbound_addr = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i64 0, i32 7 + %c_lowerbound = load i64, ptr %c_lowerbound_addr, align 8 + %c_stride_addr = getelementptr inbounds { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i64 0, i32 7, i64 0, i64 2 + %c_stride = load i64, ptr %c_stride_addr, align 8 + br label %loop.body + +loop.body: ; preds = %loop.preheader, %loop.body + %indvars.iv = phi i64 [ 1, %loop.preheader ], [ %indvars.iv.next, %loop.body ] + %6 = add nsw i64 %indvars.iv, -1 + %7 = mul i64 %a_stride, %6 + %8 = getelementptr i8, ptr %a_addr, i64 %7 + %9 = load i8, ptr %8, align 1 + %10 = mul i64 %b_stride, %6 + %11 = getelementptr i8, ptr %b_addr, i64 %10 + %12 = load i8, ptr %11, align 1 + %13 = add i8 %9, %12 + %14 = sub i64 %indvars.iv, %c_lowerbound + %15 = mul i64 %14, %c_stride + %16 = getelementptr i8, ptr %c_addr, i64 %15 + store i8 %13, ptr %16, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv, %5 + br i1 %exitcond.not, label %loop.exit, label %loop.body + +loop.exit: ; preds = %entry, %loop.body + ret void +} +
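
For reference, the pattern the new stride-versioning logic has to handle can be reduced to a few lines of IR. The sketch below is not part of the patch or the test file, and the function and value names are illustrative. It shows a float access addressed through a byte-typed GEP with a symbolic byte stride: when such a loop is versioned on the stride, the runtime check has to speculate `%stride == 4` (the access size measured in i8 GEP units), not `%stride == 1`.

; Minimal sketch (illustrative only): a float access whose address is
; computed with a byte-typed GEP and a symbolic byte stride %stride.
; Versioning this loop on the stride requires the check "%stride == 4",
; not "%stride == 1".
define void @byte_stride_float_sketch(ptr %dst, ptr %src, i64 %stride, i64 %n) {
entry:
  br label %loop

loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %off = mul i64 %stride, %i                    ; byte offset = stride * index
  %addr = getelementptr i8, ptr %src, i64 %off
  %val = load float, ptr %addr, align 4         ; 4-byte access through an i8 GEP
  %dst.addr = getelementptr float, ptr %dst, i64 %i
  store float %val, ptr %dst.addr, align 4
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}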