Index: llvm/docs/LangRef.rst =================================================================== --- llvm/docs/LangRef.rst +++ llvm/docs/LangRef.rst @@ -7130,6 +7130,33 @@ not found to interact with the environment in an observable way, the loop may be removed. This corresponds to the ``mustprogress`` function attribute. +'``llvm.loop.prefetch``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``llvm.loop.prefetch`` metadata can be attached to a ``load`` or ``store`` +instruction. It indicates whether the data accessed by that instruction should +be prefetched and, if so, which cache level to prefetch the data into and how +many loop iterations ahead of the access the prefetch is issued. + +.. code-block:: llvm + + %val = load i32, ptr %arrayidx, !llvm.loop.prefetch !0 + %val0 = load i32, ptr %arrayidx1, !llvm.loop.prefetch !1 + ... + !0 = distinct !{i1 false, i32 -1, i32 -1} + !1 = distinct !{i1 true, i32 1, i32 128} + +If the first value of this metadata is false, the data accessed by the +instruction will not be prefetched. In this case, the latter two values are +ignored. + +If the first value of this metadata is true, the data accessed by the +instruction should be prefetched. The second value indicates which cache level +the prefetched data is placed in (-1: unspecified, 0: no cache, 1: L1 cache, +2: L2 cache, 3: L3 cache). The third value indicates the number of loop +iterations ahead of the corresponding ``load``/``store`` instruction at which +the prefetch is issued (-1: unspecified). 
+ '``irr_loop``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^ Index: llvm/include/llvm/IR/FixedMetadataKinds.def =================================================================== --- llvm/include/llvm/IR/FixedMetadataKinds.def +++ llvm/include/llvm/IR/FixedMetadataKinds.def @@ -50,3 +50,4 @@ LLVM_FIXED_MD_KIND(MD_kcfi_type, "kcfi_type", 36) LLVM_FIXED_MD_KIND(MD_pcsections, "pcsections", 37) LLVM_FIXED_MD_KIND(MD_DIAssignID, "DIAssignID", 38) +LLVM_FIXED_MD_KIND(MD_loop_prefetch, "llvm.loop.prefetch", 39) Index: llvm/lib/Analysis/VectorUtils.cpp =================================================================== --- llvm/lib/Analysis/VectorUtils.cpp +++ llvm/lib/Analysis/VectorUtils.cpp @@ -896,7 +896,7 @@ for (auto Kind : {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, LLVMContext::MD_noalias, LLVMContext::MD_fpmath, LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load, - LLVMContext::MD_access_group}) { + LLVMContext::MD_access_group, LLVMContext::MD_loop_prefetch}) { MDNode *MD = I0->getMetadata(Kind); for (int J = 1, E = VL.size(); MD && J != E; ++J) { @@ -920,6 +920,9 @@ case LLVMContext::MD_access_group: MD = intersectAccessGroups(Inst, IJ); break; + case LLVMContext::MD_loop_prefetch: + // No need to handle. + break; default: llvm_unreachable("unhandled metadata"); } Index: llvm/lib/Transforms/Scalar/LICM.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LICM.cpp +++ llvm/lib/Transforms/Scalar/LICM.cpp @@ -1716,6 +1716,12 @@ << ore::NV("Inst", &I); }); + // The llvm.loop.prefetch metadata is associated with the loop where the + // instruction is located. Once it is hoisted, the metadata will become + // invalid. Remove it. + if (I.hasMetadata(LLVMContext::MD_loop_prefetch)) + I.setMetadata(LLVMContext::MD_loop_prefetch, nullptr); + // Metadata can be dependent on conditions we are hoisting above. 
// Conservatively strip all metadata on the instruction unless we were // guaranteed to execute I if we entered the loop, in which case the metadata Index: llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -240,9 +240,16 @@ bool Writes = false; /// The (first seen) prefetched instruction. Instruction *MemI = nullptr; - - /// Constructor to create a new Prefetch for \p I. - Prefetch(const SCEVAddRecExpr *L, Instruction *I) : LSCEVAddRec(L) { + /// Iterations ahead of each prefetch instruction. + unsigned ItersAhead; + /// Locality of each prefetch instruction. + unsigned Locality; + + /// Constructor to create a new Prefetch with ItersAhead and Locality info for + /// \p I. + Prefetch(const SCEVAddRecExpr *L, Instruction *I, unsigned ItersAhead, + unsigned Locality = 3) + : LSCEVAddRec(L), ItersAhead(ItersAhead), Locality(Locality) { addInstruction(I); }; @@ -337,6 +344,48 @@ PtrValue = SMemI->getPointerOperand(); } else continue; + // Locality operand passed to llvm.prefetch; there are four levels. + // '0': no locality + // '1': L3 cache + // '2': L2 cache + // '3': L1 cache + unsigned Locality = 3; + const int LOCALITY_MAX = 4; + unsigned ItersAheadCustomized = ItersAhead; + if (auto MD = I.getMetadata(LLVMContext::MD_loop_prefetch)) { + // Handle #pragma clang loop noprefetch() + bool PrefetchDisabled = + !mdconst::extract<ConstantInt>(MD->getOperand(0))->getSExtValue(); + if (PrefetchDisabled) + continue; + + // Handle #pragma clang loop prefetch() + int LocalityPrag = + mdconst::extract<ConstantInt>(MD->getOperand(1))->getSExtValue(); + int ItersAheadPrag = + mdconst::extract<ConstantInt>(MD->getOperand(2))->getSExtValue(); + // Cache level 0 requests no locality and levels 1-3 (L1-L3) map to + // llvm.prefetch localities 3-1. The locality operand of llvm.prefetch + // must stay within 0-3, so -1 (unspecified) and any out-of-range + // level keep the default of 3 rather than wrapping the unsigned + // subtraction around to an invalid value. + if (LocalityPrag == 0) + Locality = 0; + else if (LocalityPrag > 0 && LocalityPrag < LOCALITY_MAX) + Locality = LOCALITY_MAX - LocalityPrag; + else + Locality = 3; + + ItersAheadCustomized = + ItersAheadPrag == -1 ? 
ItersAhead : ItersAheadPrag; + // If ItersAhead == 0, there is no need for prefetching. + if (!ItersAheadCustomized || + ItersAheadCustomized > getMaxPrefetchIterationsAhead() || + (ConstantMaxTripCount && + ConstantMaxTripCount < ItersAheadCustomized + 1)) + continue; + } + unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) continue; @@ -367,16 +416,19 @@ } } if (!DupPref) - Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); + Prefetches.push_back( + Prefetch(LSCEVAddRec, MemI, ItersAheadCustomized, Locality)); } unsigned TargetMinStride = getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, Prefetches.size(), HasCall); - LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead - << " iterations ahead (loop size: " << LoopSize << ") in " - << L->getHeader()->getParent()->getName() << ": " << *L); + for (auto &P : Prefetches) { + LLVM_DEBUG(dbgs() << "Prefetching " << P.ItersAhead + << " iterations ahead (loop size: " << LoopSize << ") in " + << L->getHeader()->getParent()->getName() << ": " << *L); + } LLVM_DEBUG(dbgs() << "Loop has: " << NumMemAccesses << " memory accesses, " << NumStridedMemAccesses << " strided memory accesses, " @@ -392,9 +444,10 @@ BasicBlock *BB = P.InsertPt->getParent(); SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); - const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( - SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), - P.LSCEVAddRec->getStepRecurrence(*SE))); + const SCEV *NextLSCEV = SE->getAddExpr( + P.LSCEVAddRec, + SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), P.ItersAhead), + P.LSCEVAddRec->getStepRecurrence(*SE))); if (!SCEVE.isSafeToExpand(NextLSCEV)) continue; @@ -407,11 +460,10 @@ Type *I32 = Type::getInt32Ty(BB->getContext()); Function *PrefetchFunc = Intrinsic::getDeclaration( M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall( - PrefetchFunc, - {PrefPtrValue, - 
ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + Builder.CreateCall(PrefetchFunc, + {PrefPtrValue, ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, P.Locality), + ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1) Index: llvm/test/Transforms/LICM/hoist-load-with-prefetch.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LICM/hoist-load-with-prefetch.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=licm -S | FileCheck %s + +define dso_local void @test(ptr %a, i32 %n, ptr noalias %b) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 4 +; CHECK-NEXT: [[CMP5:%.*]] = icmp slt i32 0, [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.for.cond.cleanup_crit_edge: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_06]] to i64 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_06]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]] +; +entry: + %arrayidx = getelementptr 
inbounds i32, ptr %b, i64 4 + %cmp5 = icmp slt i32 0, %n + br i1 %cmp5, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.06 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %0 = load i32, ptr %arrayidx, align 4, !llvm.loop.prefetch !0 + %idxprom = zext i32 %i.06 to i64 + %arrayidx1 = getelementptr inbounds i32, ptr %a, i64 %idxprom + %1 = load i32, ptr %arrayidx1, align 4 + %add = add nsw i32 %1, %0 + store i32 %add, ptr %arrayidx1, align 4 + %inc = add nuw nsw i32 %i.06, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge +} + +!0 = distinct !{i1 true, i32 -1, i32 -1} Index: llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-noprefetch.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-noprefetch.ll @@ -0,0 +1,356 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +;RUN: opt < %s -passes=loop-data-prefetch -prefetch-distance=1000 -cache-line-size=64 -S | FileCheck %s + +define dso_local void @noprefetch_test(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @noprefetch_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ 
[[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !0 +; CHECK-NEXT: [[ADD]] = fadd contract double [[TMP1]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; + entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !0 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @prefetch_one_arg_test(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @prefetch_one_arg_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to 
i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1136 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !1 +; CHECK-NEXT: [[ADD]] = fadd contract double [[TMP3]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !1 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = 
add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @prefetch_two_args_test(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @prefetch_two_args_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1136 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 0, i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !2 +; CHECK-NEXT: [[ADD]] = fadd contract double [[TMP3]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load 
double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !2 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @prefetch_two_args_test1(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @prefetch_two_args_test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1136 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !3 +; CHECK-NEXT: [[ADD]] = 
fadd contract double [[TMP3]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !3 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @prefetch_two_args_test2(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @prefetch_two_args_test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, 
[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1136 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 2, i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !4 +; CHECK-NEXT: [[ADD]] = fadd contract double [[TMP3]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !4 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @prefetch_two_args_test3(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @prefetch_two_args_test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt 
i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1136 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 1, i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !5 +; CHECK-NEXT: [[ADD]] = fadd contract double [[TMP3]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = 
getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !5 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define dso_local void @prefetch_three_args_test(ptr %a, i32 %n, ptr %sum) { +; CHECK-LABEL: @prefetch_three_args_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP4:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP4]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[DOTPRE:%.*]] = load double, ptr [[SUM:%.*]], align 8 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi double [ [[DOTPRE]], [[FOR_BODY_PREHEADER]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 80 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 1, i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !6 +; CHECK-NEXT: [[ADD]] = fadd contract double [[TMP3]], [[TMP0]] +; CHECK-NEXT: store double [[ADD]], ptr [[SUM]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], 
label [[FOR_BODY]] +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %.pre = load double, ptr %sum, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %0 = phi double [ %.pre, %for.body.preheader ], [ %add, %for.body ] + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %1 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !6 + %add = fadd contract double %1, %0 + store double %add, ptr %sum, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +!0 = distinct !{i1 false, i32 -1, i32 -1} +!1 = distinct !{i1 true, i32 -1, i32 -1} +!2 = distinct !{i1 true, i32 0, i32 -1} +!3 = distinct !{i1 true, i32 1, i32 -1} +!4 = distinct !{i1 true, i32 2, i32 -1} +!5 = distinct !{i1 true, i32 3, i32 -1} +!6 = distinct !{i1 true, i32 3, i32 10} Index: llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-vectorize.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopDataPrefetch/pragma-loop-prefetch-vectorize.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='loop-vectorize,loop-data-prefetch' -prefetch-distance=1000 -cache-line-size=64 -S | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local void @prefetch_vectorize(ptr %a, i32 %n, ptr %b) { +; CHECK-LABEL: @prefetch_vectorize( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br 
i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[UGLYGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDVAR7:%.*]] = phi i64 [ [[INDVAR_NEXT8:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVAR7]], 5 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 2432 +; CHECK-NEXT: [[UGLYGEP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[INDVAR7]], 5 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 320 +; CHECK-NEXT: [[UGLYGEP9:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds double, ptr [[TMP7]], i32 0 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP9]], i32 0, i32 2, i32 1) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP9]], align 8, !alias.scope !0, !llvm.loop.prefetch !3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x double>, ptr [[TMP10]], align 8, !alias.scope !0, !llvm.loop.prefetch !3 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i32 0 +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP10]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr [[TMP13]], align 8, !alias.scope !4, !noalias !0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP14]], align 8, !alias.scope !4, !noalias !0 +; CHECK-NEXT: [[TMP15:%.*]] = fadd contract <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd contract <2 x double> [[WIDE_LOAD2]], [[WIDE_LOAD4]] +; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[TMP13]], align 8, !alias.scope !4, !noalias !0 +; CHECK-NEXT: store <2 x double> [[TMP16]], ptr [[TMP14]], align 8, !alias.scope !4, !noalias !0 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[INDVAR_NEXT8]] = add i64 [[INDVAR7]], 1 +; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: [[TMP18:%.*]] = shl nsw i64 [[BC_RESUME_VAL]], 3 +; CHECK-NEXT: [[TMP19:%.*]] = add nsw i64 [[TMP18]], 80 +; CHECK-NEXT: [[TMP20:%.*]] = shl nsw i64 [[BC_RESUME_VAL]], 3 +; CHECK-NEXT: [[TMP21:%.*]] = add nsw i64 [[TMP20]], 1136 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP22:%.*]] = shl i64 [[INDVAR]], 3 +; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP23]] +; CHECK-NEXT: [[TMP24:%.*]] = shl i64 [[INDVAR]], 3 +; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[TMP19]], [[TMP24]] +; CHECK-NEXT: [[UGLYGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP25]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP5]], i32 0, i32 2, i32 1) +; CHECK-NEXT: [[TMP26:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !llvm.loop.prefetch !3 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP6]], i32 0, i32 3, i32 1) +; CHECK-NEXT: [[TMP27:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[TMP26]], [[TMP27]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 
[[INDVAR]], 1 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double, ptr %a, i64 %indvars.iv + %0 = load double, ptr %arrayidx, align 8, !llvm.loop.prefetch !0 + %arrayidx2 = getelementptr inbounds double, ptr %b, i64 %indvars.iv + %1 = load double, ptr %arrayidx2, align 8 + %add = fadd contract double %0, %1 + store double %add, ptr %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} + +!0 = distinct !{i1 true, i32 2, i32 10}