diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -130,6 +130,11 @@
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
 
+static cl::opt<unsigned> MaxForkedSCEVDepth(
+    "max-forked-scev-depth", cl::Hidden,
+    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
+    cl::init(5));
+
 bool VectorizerParams::isInterleaveForced() {
   return ::VectorizationInterleave.getNumOccurrences() > 0;
 }
@@ -778,6 +783,142 @@
   }
 }
 
+// Walk back through the IR for a pointer, looking for a select like the
+// following:
+//
+//  %offset = select i1 %cmp, i64 %a, i64 %b
+//  %addr = getelementptr double, double* %base, i64 %offset
+//  %ld = load double, double* %addr, align 8
+//
+// We won't be able to form a single SCEVAddRecExpr from this since the
+// address for each loop iteration depends on %cmp. We could potentially
+// produce multiple valid SCEVAddRecExprs, though, and check all of them for
+// memory safety/aliasing if needed.
+//
+// If we encounter some IR we don't yet handle, or something obviously fine
+// like a constant, then we just add the SCEV for that term to the list passed
+// in by the caller. If we have a node that may yield a valid SCEVAddRecExpr,
+// then we decompose it into parts and build the SCEV terms ourselves before
+// adding them to the list.
+static void
+findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr,
+                SmallVectorImpl<std::pair<const SCEV *, bool>> &ScevList,
+                unsigned Depth) {
+  // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
+  // we've exceeded our limit on recursion, just return whatever we have
+  // regardless of whether it can be used for a forked pointer or not, along
+  // with an indication of whether it might be a poison or undef value.
+  const SCEV *Scev = SE->getSCEV(Ptr);
+  if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
+      !isa<Instruction>(Ptr) || Depth == 0) {
+    ScevList.push_back(
+        std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)));
+    return;
+  }
+
+  Depth--;
+
+  auto UndefPoisonCheck = [](std::pair<const SCEV *, bool> S) -> bool {
+    return S.second;
+  };
+
+  Instruction *I = cast<Instruction>(Ptr);
+  unsigned Opcode = I->getOpcode();
+  switch (Opcode) {
+  case Instruction::GetElementPtr: {
+    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    Type *SourceTy = GEP->getSourceElementType();
+    // We only handle base + single offset GEPs here for now.
+    // Not dealing with preexisting gathers yet, so no vectors.
+    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
+      ScevList.push_back(
+          std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP)));
+      break;
+    }
+    SmallVector<std::pair<const SCEV *, bool>, 2> BaseScevs;
+    SmallVector<std::pair<const SCEV *, bool>, 2> OffsetScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
+
+    // See if we need to freeze our fork...
+    bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
+                       any_of(OffsetScevs, UndefPoisonCheck);
+
+    // Check that we only have a single fork, on either the base or the offset.
+    // Copy the SCEV across for the one without a fork in order to generate
+    // the full SCEV for both sides of the GEP.
+    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
+      BaseScevs.push_back(BaseScevs[0]);
+    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
+      OffsetScevs.push_back(OffsetScevs[0]);
+    else {
+      ScevList.push_back(std::make_pair(Scev, NeedsFreeze));
+      break;
+    }
+
+    // Find the pointer type we need to extend to.
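+    // (getEffectiveSCEVType maps the pointer's SCEV type to an integer type
+    // of pointer width; the GEP index may be narrower than that, which is
+    // why the offsets are truncated or sign-extended below.)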
+    Type *IntPtrTy = SE->getEffectiveSCEVType(
+        SE->getSCEV(GEP->getPointerOperand())->getType());
+
+    // Find the size of the type being pointed to. We only have a single
+    // index term (guarded above) so we don't need to index into arrays or
+    // structures, just get the size of the scalar value.
+    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
+
+    // Scale up the offsets by the size of the type, then add to the bases.
+    const SCEV *Scaled1 = SE->getMulExpr(
+        Size, SE->getTruncateOrSignExtend(OffsetScevs[0].first, IntPtrTy));
+    const SCEV *Scaled2 = SE->getMulExpr(
+        Size, SE->getTruncateOrSignExtend(OffsetScevs[1].first, IntPtrTy));
+    ScevList.push_back(std::make_pair(
+        SE->getAddExpr(BaseScevs[0].first, Scaled1), NeedsFreeze));
+    ScevList.push_back(std::make_pair(
+        SE->getAddExpr(BaseScevs[1].first, Scaled2), NeedsFreeze));
+    break;
+  }
+  case Instruction::Select: {
+    SmallVector<std::pair<const SCEV *, bool>, 2> ChildScevs;
+    // A select means we've found a forked pointer, but we currently only
+    // support a single select per pointer, so if there's another behind this
+    // one we just bail out and return the generic SCEV.
+    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
+    if (ChildScevs.size() == 2) {
+      ScevList.push_back(ChildScevs[0]);
+      ScevList.push_back(ChildScevs[1]);
+    } else
+      ScevList.push_back(
+          std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)));
+    break;
+  }
+  default:
+    // Just return the current SCEV if we haven't handled the instruction yet.
+    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
+    ScevList.push_back(
+        std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)));
+    break;
+  }
+
+  return;
+}
+
+static SmallVector<std::pair<const SCEV *, bool>>
+findForkedPointer(PredicatedScalarEvolution &PSE,
+                  const ValueToValueMap &StridesMap, Value *Ptr,
+                  const Loop *L) {
+  ScalarEvolution *SE = PSE.getSE();
+  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
+  SmallVector<std::pair<const SCEV *, bool>, 2> Scevs;
+  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
+
+  // For now, we will only accept a forked pointer with two possible SCEVs.
+  if (Scevs.size() == 2)
+    return std::move(Scevs);
+
+  return {
+      std::make_pair(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false)};
+}
+
 bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
                                           MemAccessInfo Access, Type *AccessTy,
                                           const ValueToValueMap &StridesMap,
@@ -787,19 +928,8 @@
                                           bool Assume) {
   Value *Ptr = Access.getPointer();
 
-  ScalarEvolution &SE = *PSE.getSE();
-  SmallVector<std::pair<const SCEV *, bool>> TranslatedPtrs;
-  auto *SI = dyn_cast<SelectInst>(Ptr);
-  // Look through selects in the current loop.
-  if (SI && !TheLoop->isLoopInvariant(SI)) {
-    TranslatedPtrs = {
-        std::make_pair(SE.getSCEV(SI->getOperand(1)),
-                       !isGuaranteedNotToBeUndefOrPoison(SI->getOperand(1))),
-        std::make_pair(SE.getSCEV(SI->getOperand(2)),
-                       !isGuaranteedNotToBeUndefOrPoison(SI->getOperand(2)))};
-  } else
-    TranslatedPtrs = {
-        std::make_pair(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false)};
+  SmallVector<std::pair<const SCEV *, bool>> TranslatedPtrs =
+      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
   for (auto &P : TranslatedPtrs) {
     const SCEV *PtrExpr = P.first;
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
--- a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -disable-output -passes='print<access-info>' %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -passes='print<access-info>' -max-forked-scev-depth=2 %s 2>&1 | FileCheck -check-prefix=RECURSE %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -59,17 +60,59 @@
 }
 
 ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: cannot identify array bounds
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[G1:.+]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G2:.+]]):
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group ([[G1]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G3:.+]]):
+; CHECK-NEXT: %.sink.in = getelementptr inbounds float, ptr %spec.select, i64 %indvars.iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group ([[G1]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G4:.+]]):
+; CHECK-NEXT: %.sink.in = getelementptr inbounds float, ptr %spec.select, i64 %indvars.iv
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[G1]]:
+; CHECK-NEXT: (Low: %Dest High: (400 + %Dest))
+; CHECK-NEXT: Member: {%Dest,+,4}<%for.body>
+; CHECK-NEXT: Group [[G2]]:
+; CHECK-NEXT: (Low: %Preds High: (400 + %Preds))
+; CHECK-NEXT: Member: {%Preds,+,4}<%for.body>
+; CHECK-NEXT: Group [[G3]]:
+; CHECK-NEXT: (Low: %Base2 High: (400 + %Base2))
+; CHECK-NEXT: Member: {%Base2,+,4}<%for.body>
+; CHECK-NEXT: Group [[G4]]:
+; CHECK-NEXT: (Low: %Base1 High: (400 + %Base1))
+; CHECK-NEXT: Member: {%Base1,+,4}<%for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT: SCEV assumptions:
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Expressions re-written:
+;; We have a limit on the recursion depth for finding a loop invariant or
+;; addrec term; confirm we won't exceed that depth by forcing a lower
+;; limit via -max-forked-scev-depth=2.
+; RECURSE-LABEL: Loop access info in function 'forked_ptrs_same_base_different_offset':
+; RECURSE-NEXT: for.body:
+; RECURSE-NEXT: Report: cannot identify array bounds
+; RECURSE-NEXT: Dependences:
+; RECURSE-NEXT: Run-time memory checks:
+; RECURSE-NEXT: Grouped accesses:
+; RECURSE-EMPTY:
+; RECURSE-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; RECURSE-NEXT: SCEV assumptions:
+; RECURSE-EMPTY:
+; RECURSE-NEXT: Expressions re-written:
+
 ;;;; Derived from the following C code
 ;; void forked_ptrs_different_base_same_offset(float *A, float *B, float *C, int *D) {
 ;;   for (int i=0; i<100; i++) {
@@ -104,11 +147,38 @@
 }
 
 ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset_possible_poison':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: cannot identify array bounds
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[G1:.+]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G2:.+]]):
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group ([[G1]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G3:.+]]):
+; CHECK-NEXT: %.sink.in = getelementptr inbounds float, ptr %spec.select, i64 %indvars.iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group ([[G1]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G4:.+]]):
+; CHECK-NEXT: %.sink.in = getelementptr inbounds float, ptr %spec.select, i64 %indvars.iv
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[G1]]:
+; CHECK-NEXT: (Low: %Dest High: (400 + %Dest))
+; CHECK-NEXT: Member: {%Dest,+,4}<%for.body>
+; CHECK-NEXT: Group [[G2]]:
+; CHECK-NEXT: (Low: %Preds High: (400 + %Preds))
+; CHECK-NEXT: Member: {%Preds,+,4}<%for.body>
+; CHECK-NEXT: Group [[G3]]:
+; CHECK-NEXT: (Low: %Base2 High: (400 + %Base2))
+; CHECK-NEXT: Member: {%Base2,+,4}<%for.body>
+; CHECK-NEXT: Group [[G4]]:
+; CHECK-NEXT: (Low: %Base1 High: (400 + %Base1))
+; CHECK-NEXT: Member: {%Base1,+,4}<%for.body>
 ; CHECK-EMPTY:
 ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT: SCEV assumptions:
diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
--- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
@@ -17,22 +17,82 @@
 define dso_local void @forked_ptrs_different_base_same_offset(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) {
 ; CHECK-LABEL: @forked_ptrs_different_base_same_offset(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BASE1_FR:%.*]] = freeze float* [[BASE1:%.*]]
+; CHECK-NEXT: [[BASE2_FR:%.*]] = freeze float* [[BASE2:%.*]]
+; CHECK-NEXT: [[DEST_FR:%.*]] = freeze float* [[DEST:%.*]]
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[DEST1:%.*]] = ptrtoint float* [[DEST_FR]] to i64
+; CHECK-NEXT: [[PREDS2:%.*]] = ptrtoint i32* [[PREDS:%.*]] to i64
+; CHECK-NEXT: [[BASE23:%.*]] = ptrtoint float* [[BASE2_FR]] to i64
+; CHECK-NEXT: [[BASE15:%.*]] = ptrtoint float* [[BASE1_FR]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DEST1]], [[PREDS2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DEST1]], [[BASE23]]
+; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DEST1]], [[BASE15]]
+; CHECK-NEXT: [[DIFF_CHECK7:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT: [[CONFLICT_RDX8:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK7]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX8]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[BASE2_FR]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x float*> poison, float* [[BASE1_FR]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT9]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x float*> [[BROADCAST_SPLAT]], <4 x float*> [[BROADCAST_SPLAT10]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float*> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float*> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float*> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float*> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP13]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP15]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP17]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i64 2
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP21]], i64 3
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[DEST_FR]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP25]], <4 x float>* [[TMP27]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2:%.*]], float* [[BASE1:%.*]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP29]], 0
+; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2_FR]], float* [[BASE1_FR]]
 ; CHECK-NEXT: [[DOTSINK_IN:%.*]] = getelementptr inbounds float, float* [[SPEC_SELECT]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT: [[DOTSINK:%.*]] = load float, float* [[DOTSINK_IN]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[DEST_FR]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP30]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ;
 entry:
   br label %for.body
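For reference, the loop pattern exercised above, sketched in C in the style of the test file's "Derived from the following C code" comments. This is a reconstruction from the test IR (the argument names and the trip count of 100 follow the IR), not code copied from the original sources:

  void forked_ptrs_different_base_same_offset(float *Base1, float *Base2,
                                              float *Dest, int *Preds) {
    for (int i = 0; i < 100; i++) {
      /* The base pointer is chosen per iteration, so no single
         SCEVAddRecExpr describes the load address; findForkedSCEVs()
         instead produces one SCEV per side of the select, and each is
         covered by the run-time checks shown in the tests above. */
      float *Src = (Preds[i] == 0) ? Base2 : Base1;
      Dest[i] = Src[i];
    }
  }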