diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -130,6 +130,11 @@
     cl::desc("Enable conflict detection in loop-access analysis"),
     cl::init(true));
 
+static cl::opt<unsigned> MaxForkedSCEVDepth(
+    "max-forked-scev-depth", cl::Hidden,
+    cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
+    cl::init(5));
+
 bool VectorizerParams::isInterleaveForced() {
   return ::VectorizationInterleave.getNumOccurrences() > 0;
 }
@@ -772,6 +777,126 @@
   }
 }
 
+// Walk back through the IR for a pointer, looking for a select like the
+// following:
+//
+//  %offset = select i1 %cmp, i64 %a, i64 %b
+//  %addr = getelementptr double, double* %base, i64 %offset
+//  %ld = load double, double* %addr, align 8
+//
+// We won't be able to form a single SCEVAddRecExpr from this since the
+// address for each loop iteration depends on %cmp. We could potentially
+// produce multiple valid SCEVAddRecExprs, though, and check all of them for
+// memory safety/aliasing if needed.
+//
+// If we encounter some IR we don't yet handle, or something obviously fine
+// like a constant, then we just add the SCEV for that term to the list passed
+// in by the caller. If we have a node that may potentially yield a valid
+// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
+// ourselves before adding to the list.
+static void findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr,
+                            SmallVectorImpl<const SCEV *> &ScevList,
+                            unsigned Depth) {
+  // If our Value is loop invariant or a SCEVAddRecExpr, we already have
+  // a usable value. If it's not an instruction or we've exceeded our limit
+  // on recursion, just return whatever we have regardless of whether it can
+  // be used for a forked pointer or not.
+  const SCEV *Scev = SE->getSCEV(Ptr);
+  if (SE->isLoopInvariant(Scev, L) || isa<SCEVAddRecExpr>(Scev) ||
+      !isa<Instruction>(Ptr) || Depth == 0) {
+    ScevList.push_back(Scev);
+    return;
+  }
+
+  Depth--;
+
+  Instruction *I = cast<Instruction>(Ptr);
+  unsigned Opcode = I->getOpcode();
+  switch (Opcode) {
+  case Instruction::GetElementPtr: {
+    GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+    Type *SourceTy = GEP->getSourceElementType();
+    // We only handle base + single offset GEPs here for now.
+    // Not dealing with preexisting gathers yet, so no vectors.
+    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
+      ScevList.push_back(Scev);
+      break;
+    }
+    SmallVector<const SCEV *, 2> BaseScevs;
+    SmallVector<const SCEV *, 2> OffsetScevs;
+    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
+
+    // Check that we only have a single fork, on either the base or the
+    // offset. Copy the SCEV across for the one without a fork in order to
+    // generate the full SCEV for both sides of the GEP.
+    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
+      BaseScevs.push_back(BaseScevs[0]);
+    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
+      OffsetScevs.push_back(OffsetScevs[0]);
+    else {
+      ScevList.push_back(Scev);
+      break;
+    }
+
+    // Find the pointer type we need to extend to.
+    Type *IntPtrTy = SE->getEffectiveSCEVType(
+        SE->getSCEV(GEP->getPointerOperand())->getType());
+
+    // Find the size of the type being pointed to. We only have a single
+    // index term (guarded above) so we don't need to index into arrays or
+    // structures, just get the size of the scalar value.
+    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
+
+    // Scale up the offsets by the size of the type, then add to the bases.
+    const SCEV *Scaled1 = SE->getMulExpr(
+        Size, SE->getTruncateOrSignExtend(OffsetScevs[0], IntPtrTy));
+    const SCEV *Scaled2 = SE->getMulExpr(
+        Size, SE->getTruncateOrSignExtend(OffsetScevs[1], IntPtrTy));
+    ScevList.push_back(SE->getAddExpr(BaseScevs[0], Scaled1));
+    ScevList.push_back(SE->getAddExpr(BaseScevs[1], Scaled2));
+    break;
+  }
+  case Instruction::Select: {
+    SmallVector<const SCEV *, 2> ChildScevs;
+    // A select means we've found a forked pointer, but we currently only
+    // support a single select per pointer, so if there's another behind
+    // this one we just bail out and return the generic SCEV.
+    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
+    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
+    if (ChildScevs.size() == 2) {
+      ScevList.push_back(ChildScevs[0]);
+      ScevList.push_back(ChildScevs[1]);
+    } else
+      ScevList.push_back(Scev);
+    break;
+  }
+  default:
+    // Just return the current SCEV if we haven't handled the instruction yet.
+    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
+    ScevList.push_back(Scev);
+    break;
+  }
+
+  return;
+}
+
+static SmallVector<const SCEV *>
+findForkedPointer(PredicatedScalarEvolution &PSE,
+                  const ValueToValueMap &StridesMap, Value *Ptr,
+                  const Loop *L) {
+  ScalarEvolution *SE = PSE.getSE();
+  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
+  SmallVector<const SCEV *> Scevs;
+  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
+
+  // For now, we will only accept a forked pointer with two options.
+  if (Scevs.size() == 2)
+    return Scevs;
+
+  return {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)};
+}
+
 bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
                                           MemAccessInfo Access, Type *AccessTy,
                                           const ValueToValueMap &StridesMap,
@@ -781,13 +906,8 @@
                                           bool Assume) {
   Value *Ptr = Access.getPointer();
 
-  ScalarEvolution &SE = *PSE.getSE();
-  SmallVector<const SCEV *> TranslatedPtrs;
-  if (auto *SI = dyn_cast<SelectInst>(Ptr))
-    TranslatedPtrs = {SE.getSCEV(SI->getOperand(1)),
-                      SE.getSCEV(SI->getOperand(2))};
-  else
-    TranslatedPtrs = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr)};
+  SmallVector<const SCEV *> TranslatedPtrs =
+      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
   for (const SCEV *PtrExpr : TranslatedPtrs) {
     if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
--- a/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forked-pointers.ll
@@ -1,4 +1,5 @@
-; RUN: opt -disable-output -passes='print-access-info' %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -opaque-pointers -passes='print-access-info' %s 2>&1 | FileCheck %s
+; RUN: opt -disable-output -opaque-pointers -passes='print-access-info' -max-forked-scev-depth=2 %s 2>&1 | FileCheck -check-prefix=RECURSE %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
@@ -9,16 +10,16 @@
 ; CHECK-NEXT: Run-time memory checks:
 ; CHECK-NEXT: Check 0:
 ; CHECK-NEXT: Comparing group ([[G1:.+]]):
-; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, float* %Dest, i64 %iv
-; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, float* %Dest, i64 %iv
+; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, ptr %Dest, i64 %iv
+; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, ptr %Dest, i64 %iv
 ; CHECK-NEXT: Against group ([[G2:.+]]):
-; CHECK-NEXT: %select = select i1 %cmp, float* %gep.1, float* %gep.2
+; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.1, ptr %gep.2
 ; CHECK-NEXT: Check 1:
 ; CHECK-NEXT: Comparing group ([[G1]]):
-; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, float* %Dest, i64 %iv
-; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, float* %Dest, i64 %iv
+; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, ptr %Dest, i64 %iv
+; CHECK-NEXT: %gep.Dest = getelementptr inbounds float, ptr %Dest, i64 %iv
 ; CHECK-NEXT: Against group ([[G3:.+]]):
-; CHECK-NEXT: %select = select i1 %cmp, float* %gep.1, float* %gep.2
+; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.1, ptr %gep.2
 ; CHECK-NEXT: Grouped accesses:
 ; CHECK-NEXT: Group [[G1]]
 ; CHECK-NEXT: (Low: %Dest High: (400 + %Dest))
@@ -58,18 +59,59 @@
   ret void
 }
 
-
 ; CHECK-LABEL: function 'forked_ptrs_different_base_same_offset':
-; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: cannot identify array bounds
-; CHECK-NEXT: Dependences:
-; CHECK-NEXT: Run-time memory checks:
-; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: for.body:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[G1:.+]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G2:.+]]):
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group ([[G1]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G3:.+]]):
+; CHECK-NEXT: %.sink.in = getelementptr inbounds float, ptr %spec.select, i64 %indvars.iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group ([[G1]]):
+; CHECK-NEXT: %1 = getelementptr inbounds float, ptr %Dest, i64 %indvars.iv
+; CHECK-NEXT: Against group ([[G4:.+]]):
+; CHECK-NEXT: %.sink.in = getelementptr inbounds float, ptr %spec.select, i64 %indvars.iv
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[G1]]:
+; CHECK-NEXT: (Low: %Dest High: (400 + %Dest))
+; CHECK-NEXT: Member: {%Dest,+,4}<%for.body>
+; CHECK-NEXT: Group [[G2]]:
+; CHECK-NEXT: (Low: %Preds High: (400 + %Preds))
+; CHECK-NEXT: Member: {%Preds,+,4}<%for.body>
+; CHECK-NEXT: Group [[G3]]:
+; CHECK-NEXT: (Low: %Base2 High: (400 + %Base2))
+; CHECK-NEXT: Member: {%Base2,+,4}<%for.body>
+; CHECK-NEXT: Group [[G4]]:
+; CHECK-NEXT: (Low: %Base1 High: (400 + %Base1))
+; CHECK-NEXT: Member: {%Base1,+,4}<%for.body>
 ; CHECK-EMPTY:
-; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
-; CHECK-NEXT: SCEV assumptions:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
 ; CHECK-EMPTY:
-; CHECK-NEXT: Expressions re-written:
+; CHECK-NEXT: Expressions re-written:
+
+;; We have a limit on the recursion depth for finding a loop invariant or
+;; addrec term; confirm that we bail out gracefully when a lower limit is
+;; forced via -max-forked-scev-depth=2.
+; RECURSE-LABEL: Loop access info in function 'forked_ptrs_same_base_different_offset':
+; RECURSE-NEXT: for.body:
+; RECURSE-NEXT: Report: cannot identify array bounds
+; RECURSE-NEXT: Dependences:
+; RECURSE-NEXT: Run-time memory checks:
+; RECURSE-NEXT: Grouped accesses:
+; RECURSE-EMPTY:
+; RECURSE-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; RECURSE-NEXT: SCEV assumptions: +; RECURSE-EMPTY: +; RECURSE-NEXT: Expressions re-written: ;;;; Derived from the following C code ;; void forked_ptrs_different_base_same_offset(float *A, float *B, float *C, int *D) { @@ -237,3 +279,150 @@ %exitcond.not = icmp eq i64 %indvars.iv.next, 100 br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } + +;; We don't currently handle a fork in both the base and the offset of a +;; GEP instruction. + +; CHECK-LABEL: Loop access info in function 'forked_ptrs_two_forks_gep': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: + +define dso_local void @forked_ptrs_two_forks_gep(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) { +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %cmp1.not = icmp eq i32 %0, 0 + %spec.select = select i1 %cmp1.not, float* %Base2, float* %Base1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %offset = select i1 %cmp1.not, i64 %indvars.iv.next, i64 %indvars.iv + %.sink.in = getelementptr inbounds float, float* %spec.select, i64 %offset + %.sink = load float, float* %.sink.in, align 4 + %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv + store float %.sink, float* %1, align 4 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +;; We don't handle forks as children of a select + +; CHECK-LABEL: Loop access info in function 'forked_ptrs_two_select': +; CHECK-NEXT: loop: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: + +define void @forked_ptrs_two_select(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture readonly %Base3, float* %Dest) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.Dest = getelementptr inbounds float, float* %Dest, i64 %iv + %l.Dest = load float, float* %gep.Dest + %cmp = fcmp une float %l.Dest, 0.0 + %cmp1 = fcmp une float %l.Dest, 1.0 + %gep.1 = getelementptr inbounds float, float* %Base1, i64 %iv + %gep.2 = getelementptr inbounds float, float* %Base2, i64 %iv + %gep.3 = getelementptr inbounds float, float* %Base3, i64 %iv + %select = select i1 %cmp, float* %gep.1, float* %gep.2 + %select1 = select i1 %cmp1, float* %select, float* %gep.3 + %sink = load float, float* %select1, align 4 + store float %sink, float* %gep.Dest, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 100 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + +;; We don't yet handle geps with more than 2 operands +; CHECK-LABEL: Loop access info in function 'forked_ptrs_too_many_gep_ops': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: + +define void @forked_ptrs_too_many_gep_ops(ptr nocapture readonly %Base1, ptr nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) { +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %Preds, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %cmp1.not = icmp eq i32 %0, 0 + %spec.select = select i1 %cmp1.not, ptr %Base2, ptr %Base1 + %.sink.in = getelementptr inbounds [1000 x float], ptr %spec.select, i64 0, i64 %indvars.iv + %.sink = load float, ptr %.sink.in, align 4 + %1 = getelementptr inbounds float, float* %Dest, i64 %indvars.iv + store float %.sink, float* %1, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 100 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: + ret void +} + +;; We don't currently handle vector GEPs +; CHECK-LABEL: Loop access info in function 'forked_ptrs_vector_gep': +; CHECK-NEXT: for.body: +; CHECK-NEXT: Report: cannot identify array bounds +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+
+define void @forked_ptrs_vector_gep(ptr nocapture readonly %Base1, ptr nocapture readonly %Base2, ptr nocapture %Dest, ptr nocapture readonly %Preds) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %Preds, i64 %indvars.iv
+  %0 = load i32, ptr %arrayidx, align 4
+  %cmp1.not = icmp eq i32 %0, 0
+  %spec.select = select i1 %cmp1.not, ptr %Base2, ptr %Base1
+  %.sink.in = getelementptr inbounds <4 x float>, ptr %spec.select, i64 %indvars.iv
+  %.sink = load <4 x float>, ptr %.sink.in, align 4
+  %1 = getelementptr inbounds <4 x float>, ptr %Dest, i64 %indvars.iv
+  store <4 x float> %.sink, ptr %1, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
+  %exitcond.not = icmp eq i64 %indvars.iv.next, 100
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
\ No newline at end of file
diff --git a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
--- a/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
+++ b/llvm/test/Transforms/LoopVectorize/forked-pointers.ll
@@ -17,22 +17,79 @@
 define dso_local void @forked_ptrs_different_base_same_offset(float* nocapture readonly %Base1, float* nocapture readonly %Base2, float* nocapture %Dest, i32* nocapture readonly %Preds) {
 ; CHECK-LABEL: @forked_ptrs_different_base_same_offset(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK: vector.memcheck:
+; CHECK-NEXT: [[DEST1:%.*]] = ptrtoint float* [[DEST:%.*]] to i64
+; CHECK-NEXT: [[PREDS2:%.*]] = ptrtoint i32* [[PREDS:%.*]] to i64
+; CHECK-NEXT: [[BASE23:%.*]] = ptrtoint float* [[BASE2:%.*]] to i64
+; CHECK-NEXT: [[BASE15:%.*]] = ptrtoint float* [[BASE1:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DEST1]], [[PREDS2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DEST1]], [[BASE23]]
+; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DEST1]], [[BASE15]]
+; CHECK-NEXT: [[DIFF_CHECK6:%.*]] = icmp ult i64 [[TMP2]], 16
+; CHECK-NEXT: [[CONFLICT_RDX7:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK6]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float*> poison, float* [[BASE2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x float*> poison, float* [[BASE1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x float*> [[BROADCAST_SPLATINSERT8]], <4 x float*> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x float*> [[BROADCAST_SPLAT]], <4 x float*> [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float*> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float*> [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float*> [[TMP9]], i64 2
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float*> [[TMP9]], i64 3
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP13]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP15]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP17]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP19]], i64 1
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP20]], i64 2
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP21]], i64 3
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP25]], <4 x float>* [[TMP27]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.cond.cleanup:
 ; CHECK-NEXT: ret void
 ; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2:%.*]], float* [[BASE1:%.*]]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PREDS]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP29]], 0
+; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP1_NOT]], float* [[BASE2]], float* [[BASE1]]
 ; CHECK-NEXT: [[DOTSINK_IN:%.*]] = getelementptr inbounds float, float* [[SPEC_SELECT]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT: [[DOTSINK:%.*]] = load float, float* [[DOTSINK_IN]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[DEST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store float [[DOTSINK]], float* [[TMP30]], align 4
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ;
 entry:
   br label %for.body
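
Editor's note: for readers unfamiliar with the shape of the recursion in findForkedSCEVs, the following minimal standalone C++ sketch models it on a toy expression tree. It is not LLVM code and all names are hypothetical; it only tracks the shape of the recursion (a select contributes one term per arm, a two-operand GEP tolerates a fork on exactly one side and duplicates the other, and anything unhandled or past the depth budget collapses to a single opaque term). The real patch builds SCEVs and scales the offset by the element size before adding it to the base, which this sketch omits.

// Standalone sketch of the forked-SCEV recursion; symbolic strings stand in
// for SCEV expressions. Illustrative only, not the LLVM implementation.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Expr {
  enum Kind { Leaf, Select, Gep } K;
  std::string Name;           // Leaf only: symbolic name of the term.
  std::shared_ptr<Expr> A, B; // Select: the two arms; Gep: base and offset.
};

// Appends one term (no fork found below E) or two terms (exactly one fork)
// to Out; "<opaque>" models giving up and keeping the unanalyzed SCEV.
static void findForked(const std::shared_ptr<Expr> &E,
                       std::vector<std::string> &Out, unsigned Depth) {
  if (E->K == Expr::Leaf || Depth == 0) {
    Out.push_back(E->K == Expr::Leaf ? E->Name : "<opaque>");
    return;
  }
  --Depth;
  if (E->K == Expr::Select) {
    // A select is the fork itself: one term per arm, but only if neither
    // arm forks again (i.e. we got back exactly one term from each).
    std::vector<std::string> Children;
    findForked(E->A, Children, Depth);
    findForked(E->B, Children, Depth);
    if (Children.size() == 2)
      Out.insert(Out.end(), Children.begin(), Children.end());
    else
      Out.push_back("<opaque>");
    return;
  }
  // Gep: allow a fork on at most one operand; duplicate the other side so
  // both resulting terms have a complete base + offset expression.
  std::vector<std::string> Bases, Offsets;
  findForked(E->A, Bases, Depth);
  findForked(E->B, Offsets, Depth);
  if (Bases.size() == 2 && Offsets.size() == 1)
    Offsets.push_back(Offsets[0]);
  else if (Offsets.size() == 2 && Bases.size() == 1)
    Bases.push_back(Bases[0]);
  else if (Bases.size() != 1 || Offsets.size() != 1) {
    Out.push_back("<opaque>"); // Forked on both sides: bail out.
    return;
  }
  for (size_t I = 0; I < Bases.size(); ++I)
    Out.push_back("(" + Bases[I] + " + " + Offsets[I] + ")");
}

int main() {
  // Models: %addr = getelementptr float, ptr %Base, i64 select(%cmp, %i, %j)
  auto L = [](const std::string &N) {
    return std::make_shared<Expr>(Expr{Expr::Leaf, N, nullptr, nullptr});
  };
  auto Sel = std::make_shared<Expr>(Expr{Expr::Select, "", L("%i"), L("%j")});
  auto Gep = std::make_shared<Expr>(Expr{Expr::Gep, "", L("%Base"), Sel});

  std::vector<std::string> Out;
  findForked(Gep, Out, /*Depth=*/5);
  for (const std::string &S : Out)
    std::cout << S << "\n"; // Prints "(%Base + %i)" and "(%Base + %j)".
  return 0;
}

With Depth forced to 0 (mirroring a low -max-forked-scev-depth), the same call yields a single "<opaque>" term, which corresponds to the caller falling back to the non-forked replaceSymbolicStrideSCEV path, as the RECURSE test above checks.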