Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -210,7 +210,7 @@ /// In same cases when the dependency check fails we can still /// vectorize the loop with a dynamic array access check. bool shouldRetryWithRuntimeCheck() const { - return FoundNonConstantDistanceDependence && + return FoundUncomputableDistanceDependence && Status == VectorizationSafetyStatus::PossiblySafeWithRtChecks; } @@ -283,7 +283,7 @@ /// If we see a non-constant dependence distance we can still try to /// vectorize this loop with runtime checks. - bool FoundNonConstantDistanceDependence = false; + bool FoundUncomputableDistanceDependence = false; /// Result of the dependence checks, indicating whether the checked /// dependences are safe for vectorization, require RT checks or are known to Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -445,7 +445,7 @@ // // The above case requires that we have an UnknownDependence between // accesses to the same underlying object. This cannot happen unless - // FoundNonConstantDistanceDependence is set, and therefore UseDependencies + // FoundUncomputableDistanceDependence is set, and therefore UseDependencies // is also false. In this case we will use the fallback path and create // separate checking groups for all pointers. @@ -663,7 +663,7 @@ /// perform dependency checking. /// /// Note that this can later be cleared if we retry memcheck analysis without - /// dependency checking (i.e. FoundNonConstantDistanceDependence). + /// dependency checking (i.e. FoundUncomputableDistanceDependence). bool isDependencyCheckNeeded() { return !CheckDeps.empty(); } /// We decided that no dependence analysis would be used. 
Reset the state. @@ -710,7 +710,7 @@ /// /// Note that, this is different from isDependencyCheckNeeded. When we retry /// memcheck analysis without dependency checking - /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is + /// (i.e. FoundUncomputableDistanceDependence), isDependencyCheckNeeded is /// cleared while this remains set if we have potentially dependent accesses. bool IsRTCheckAnalysisNeeded = false; @@ -1694,7 +1694,27 @@ // Need accesses with constant stride. We don't want to vectorize // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in // the address space. - if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){ + if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) { + bool SrcInvariant = PSE.getSE()->isLoopInvariant(Src, InnermostLoop); + bool SinkInvariant = PSE.getSE()->isLoopInvariant(Sink, InnermostLoop); + + assert(!(StrideAPtr && SrcInvariant) && "Cannot be strided and invariant"); + assert(!(StrideBPtr && SinkInvariant) && "Cannot be strided and invariant"); + + bool SrcAffine = StrideAPtr; + if (!SrcAffine && !SrcInvariant && isa<SCEVAddRecExpr>(Src) && + cast<SCEVAddRecExpr>(Src)->isAffine()) + SrcAffine = true; + + bool SinkAffine = StrideBPtr; + if (!SinkAffine && !SinkInvariant && isa<SCEVAddRecExpr>(Sink) && + cast<SCEVAddRecExpr>(Sink)->isAffine()) + SinkAffine = true; + + if (APtr != BPtr && (SrcAffine || SinkAffine) && + (SrcInvariant || SinkInvariant)) + FoundUncomputableDistanceDependence = true; + LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n"); return Dependence::Unknown; } @@ -1713,7 +1733,7 @@ return Dependence::NoDep; LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n"); - FoundNonConstantDistanceDependence = true; + FoundUncomputableDistanceDependence = true; return Dependence::Unknown; } Index: llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll =================================================================== --- llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll +++
llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll @@ -126,12 +126,14 @@ define i32 @load_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: 'load_with_pointer_phi_outside_loop' ; CHECK-NEXT: loop.header: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %v8 = load double, double* %ptr, align 8 -> -; CHECK-NEXT: store double %mul16, double* %arrayidx, align 8 +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ({{.*}}): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv +; CHECK-NEXT: Against group ({{.*}}): +; CHECK-NEXT: %ptr = phi double* [ %A, %if.then ], [ %ptr.select, %if.else ] ; entry: br i1 %c.0, label %if.then, label %if.else @@ -164,12 +166,14 @@ define i32 @store_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: 'store_with_pointer_phi_outside_loop' ; CHECK-NEXT: loop.header: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. -; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %v8 = load double, double* %arrayidx, align 8 -> -; CHECK-NEXT: store double %mul16, double* %ptr, align 8 +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ({{.*}}): +; CHECK-NEXT: %ptr = phi double* [ %A, %if.then ], [ %ptr.select, %if.else ] +; CHECK-NEXT: Against group ({{.*}}): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv ; entry: br i1 %c.0, label %if.then, label %if.else @@ -202,13 +206,8 @@ define i32 @store_with_pointer_phi_incoming_phi(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: 'store_with_pointer_phi_incoming_phi' ; CHECK-NEXT: loop.header: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. 
+; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %v8 = load double, double* %arrayidx, align 8 -> -; CHECK-NEXT: store double %mul16, double* %ptr.2, align 8 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Check 0: ; CHECK-NEXT: Comparing group ([[GROUP_C:.+]]): @@ -219,14 +218,27 @@ ; CHECK-NEXT: Comparing group ([[GROUP_C]]): ; CHECK-NEXT: double* %C ; CHECK-NEXT: Against group ([[GROUP_A:.+]]): -; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv ; CHECK-NEXT: double* %A ; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GROUP_C]]): +; CHECK-NEXT: double* %C +; CHECK-NEXT: Against group ([[GROUP_A1:.+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv +; CHECK-NEXT: Check 3: ; CHECK-NEXT: Comparing group ([[GROUP_B]]): ; CHECK-NEXT: double* %B ; CHECK-NEXT: Against group ([[GROUP_A]]): +; CHECK-NEXT: double* %A +; CHECK-NEXT: Check 4: +; CHECK-NEXT: Comparing group ([[GROUP_B]]): +; CHECK-NEXT: double* %B +; CHECK-NEXT: Against group ([[GROUP_A1]]): ; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv +; CHECK-NEXT: Check 5: +; CHECK-NEXT: Comparing group ([[GROUP_A]]): ; CHECK-NEXT: double* %A +; CHECK-NEXT: Against group ([[GROUP_A1]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv ; CHECK-NEXT: Grouped accesses: ; CHECK-NEXT: Group [[GROUP_C]]: ; CHECK-NEXT: (Low: %C High: (8 + %C)) @@ -235,9 +247,11 @@ ; CHECK-NEXT: (Low: %B High: (8 + %B)) ; CHECK-NEXT: Member: %B ; CHECK-NEXT: Group [[GROUP_A]]: +; CHECK-NEXT: (Low: %A High: (8 + %A)) +; CHECK-NEXT: Member: %A +; CHECK-NEXT: Group [[GROUP_A1]]: ; CHECK-NEXT: (Low: %A High: (256000 + %A)) ; CHECK-NEXT: Member: {%A,+,8}<%loop.header> -; CHECK-NEXT: Member: %A ; CHECK-EMPTY entry: br label %loop.header @@ -279,13 +293,8 @@ define i32 
@store_with_pointer_phi_incoming_phi_irreducible_cycle(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: 'store_with_pointer_phi_incoming_phi_irreducible_cycle' ; CHECK-NEXT: loop.header: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %v8 = load double, double* %arrayidx, align 8 -> -; CHECK-NEXT: store double %mul16, double* %ptr.3, align 8 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Check 0: ; CHECK-NEXT: Comparing group ([[GROUP_C:.+]]): @@ -296,25 +305,40 @@ ; CHECK-NEXT: Comparing group ([[GROUP_C]]): ; CHECK-NEXT: double* %C ; CHECK-NEXT: Against group ([[GROUP_A:.+]]): -; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv ; CHECK-NEXT: double* %A ; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GROUP_C]]): +; CHECK-NEXT: double* %C +; CHECK-NEXT: Against group ([[GROUP_A1:.+]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv +; CHECK-NEXT: Check 3: ; CHECK-NEXT: Comparing group ([[GROUP_B]]): ; CHECK-NEXT: double* %B ; CHECK-NEXT: Against group ([[GROUP_A]]): +; CHECK-NEXT: double* %A +; CHECK-NEXT: Check 4: +; CHECK-NEXT: Comparing group ([[GROUP_B]]): +; CHECK-NEXT: double* %B +; CHECK-NEXT: Against group ([[GROUP_A1]]): ; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv +; CHECK-NEXT: Check 5: +; CHECK-NEXT: Comparing group ([[GROUP_A]]): ; CHECK-NEXT: double* %A +; CHECK-NEXT: Against group ([[GROUP_A1]]): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv ; CHECK-NEXT: Grouped accesses: -; CHECK-NEXT: Group [[GROUP_C]] +; CHECK-NEXT: Group [[GROUP_C]]: ; CHECK-NEXT: (Low: %C High: 
(8 + %C)) ; CHECK-NEXT: Member: %C -; CHECK-NEXT: Group [[GROUP_B]] +; CHECK-NEXT: Group [[GROUP_B]]: ; CHECK-NEXT: (Low: %B High: (8 + %B)) ; CHECK-NEXT: Member: %B -; CHECK-NEXT: Group [[GROUP_A]] +; CHECK-NEXT: Group [[GROUP_A]]: +; CHECK-NEXT: (Low: %A High: (8 + %A)) +; CHECK-NEXT: Member: %A +; CHECK-NEXT: Group [[GROUP_A1]]: ; CHECK-NEXT: (Low: %A High: (256000 + %A)) ; CHECK-NEXT: Member: {%A,+,8}<%loop.header> -; CHECK-NEXT: Member: %A ; CHECK-EMPTY entry: br label %loop.header @@ -351,13 +375,22 @@ define i32 @store_with_pointer_phi_outside_loop_select(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: 'store_with_pointer_phi_outside_loop_select' ; CHECK-NEXT: loop.header: -; CHECK-NEXT: Report: unsafe dependent memory operations in loop. -; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %v8 = load double, double* %arrayidx, align 8 -> -; CHECK-NEXT: store double %mul16, double* %ptr, align 8 -; +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ({{.*}}): +; CHECK-NEXT: %ptr = phi double* [ %A, %if.then ], [ %ptr.select, %if.else ] +; CHECK-NEXT: Against group ({{.*}}): +; CHECK-NEXT: %arrayidx = getelementptr inbounds double, double* %A, i64 %iv +; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group {{.*}}: +; CHECK-NEXT: (Low: %ptr High: (8 + %ptr)) +; CHECK-NEXT: Member: %ptr +; CHECK-NEXT: Group {{.*}}: +; CHECK-NEXT: (Low: %A High: (256000 + %A)) +; CHECK-NEXT: Member: {%A,+,8}<%loop.header> +; CHECK-EMPTY entry: br i1 %c.0, label %if.then, label %if.else Index: llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll =================================================================== --- llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll +++ llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll @@ -9,20 +9,20 @@ define void @f(i32* 
noalias %a, i32* noalias %b, i32* noalias %c, i32* noalias %d, i32* noalias %e, i64 %N) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A5:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[A2:%.*]] = bitcast i32* [[A:%.*]] to i8* ; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]] ; CHECK: for.body.lver.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 -; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]]) -; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub i64 0, [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[A5]], i64 [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp ult i8* [[TMP12]], [[A5]] -; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW4]] -; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP7]], [[TMP17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; CHECK-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[A2]], i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i8* [[TMP3]], [[A2]] +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP1]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] ; CHECK: for.body.ph.lver.orig: ; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] ; CHECK: for.body.lver.orig: @@ -147,16 +147,16 @@ ; CHECK-NEXT: br label 
[[FOR_BODY_LVER_CHECK:%.*]] ; CHECK: for.body.lver.check: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -1 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 -; CHECK-NEXT: [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]]) -; CHECK-NEXT: [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0 -; CHECK-NEXT: [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1 -; CHECK-NEXT: [[TMP11:%.*]] = sub i64 0, [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to i8*), i64 [[MUL_RESULT3]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp ult i8* [[TMP12]], bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to i8*) -; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP15]], [[MUL_OVERFLOW4]] -; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP7]], [[TMP17]] -; CHECK-NEXT: br i1 [[TMP18]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 +; CHECK-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP0]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to i8*), i64 [[MUL_RESULT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i8* [[TMP3]], bitcast (i32* getelementptr inbounds ([8192 x i32], [8192 x i32]* @global_a, i64 0, i64 42) to i8*) +; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP1]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] ; CHECK: for.body.ph.lver.orig: ; CHECK-NEXT: br label 
[[FOR_BODY_LVER_ORIG:%.*]] ; CHECK: for.body.lver.orig: Index: llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -63,34 +63,34 @@ ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP0]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]] -; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]] +; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[ADD_US]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1> -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0 -; CHECK-NEXT: store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32>
[[TMP23]], i32 1 -; CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2 -; CHECK-NEXT: store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3 -; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4 +; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0 +; CHECK-NEXT: store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1 +; CHECK-NEXT: store i32 [[TMP21]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2 +; CHECK-NEXT: store i32 [[TMP22]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3 +; CHECK-NEXT: store i32 [[TMP23]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop
[[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] @@ -143,42 +143,96 @@ ; Same test as above, but without the invalid parallel_loop_access metadata. -; Here we can see the vectorizer does the mem dep checks and decides it is -; unsafe to vectorize. +; Here we can see the vectorizer does the mem dep checks and decides to vectorize +; with a run-time check define void @no-par-mem-metadata(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { ; CHECK-LABEL: @no-par-mem-metadata( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = ptrtoint i32* [[A:%.*]] to i64 ; CHECK-NEXT: [[CMP27:%.*]] = icmp sgt i32 [[M:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] ; CHECK: for.body3.lr.ph.us.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[K]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] ; CHECK: for.end.us: ; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4 -; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP0]], 3 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4 +; CHECK-NEXT: [[ADD10_US:%.*]] = add nsw i32 [[TMP5]], 3 ; CHECK-NEXT: store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32 ; 
CHECK-NEXT: [[EXITCOND36:%.*]] = icmp eq i32 [[LFTR_WIDEIV35]], [[M]] ; CHECK-NEXT: br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop [[LOOP2]] ; CHECK: for.body3.us: -; CHECK-NEXT: [[INDVARS_IV29:%.*]] = phi i64 [ 0, [[FOR_BODY3_LR_PH_US]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV29]] to i32 -; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP1]] +; CHECK-NEXT: [[INDVARS_IV29:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ] +; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV29]] to i32 +; CHECK-NEXT: [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP6]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64 -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4 -; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM_US]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ADD5_US:%.*]] = add nsw i32 [[TMP7]], 1 ; CHECK-NEXT: store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32 ; CHECK-NEXT: [[EXITCOND32:%.*]] = icmp eq i32 [[LFTR_WIDEIV31]], [[M]] -; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop [[LOOP1:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: for.body3.lr.ph.us: ; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 -; 
CHECK-NEXT: [[ADD_US]] = add i32 [[TMP3]], [[K:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = shl nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[A1]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 +; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP15]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP0]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp slt i32 [[TMP16]], [[TMP14]] +; CHECK-NEXT: br i1 [[TMP17]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP18:%.*]] = sub i64 [[A1]], [[TMP12]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP18]], 32 +; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDEX]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[ADD_US]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], 
i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP26]], i32 0 +; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i32 1 +; CHECK-NEXT: store i32 [[TMP28]], i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i32 2 +; CHECK-NEXT: store i32 [[TMP29]], i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i32 3 +; CHECK-NEXT: store i32 [[TMP30]], i32* [[ARRAYIDX7_US]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY3_US]] ; CHECK: for.end15.loopexit: ; CHECK-NEXT: br label [[FOR_END15]] Index: llvm/test/Transforms/LoopVectorize/runtime-check-invariant-and-affine.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/runtime-check-invariant-and-affine.ll @@ -0,0 +1,120 @@ +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s +%struct.f = type { %struct.c, [0 x %struct.c] } +%struct.c = type { i16, i8 } +%struct.Arrays = type { [128 x double], [128 x double], [128 x double], double } + +@h = dso_local
local_unnamed_addr global i32 0, align 4 +@g = dso_local local_unnamed_addr global %struct.f zeroinitializer, align 2 + +@a = dso_local local_unnamed_addr global i32 0, align 4 +@b = dso_local global [1 x i32] zeroinitializer, align 4 + +@n = dso_local local_unnamed_addr global i32 0, align 4 +@s1 = dso_local local_unnamed_addr global %struct.Arrays zeroinitializer, align 8 + +declare double @llvm.fmuladd.f64(double, double, double) #1 + +define dso_local void @two_sides_affine() { +; CHECK-LABEL: @two_sides_affine +; CHECK-NOT:vector.memcheck: +; CHECK-NOT:vector.body: +; +entry: + %.pr = load i32, ptr @h, align 4 + %tobool.not2 = icmp eq i32 %.pr, 0 + br i1 %tobool.not2, label %for.end, label %for.body.preheader + +for.body.preheader: + %0 = sext i32 %.pr to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [0 x %struct.c], ptr getelementptr inbounds (%struct.f, ptr @g, i64 1, i32 0, i32 0), i64 0, i64 %indvars.iv + %b = getelementptr inbounds [0 x %struct.c], ptr getelementptr inbounds (%struct.f, ptr @g, i64 1, i32 0, i32 0), i64 0, i64 %indvars.iv, i32 1 + %1 = load i8, ptr %b, align 2 + %conv = zext i8 %1 to i16 + store i16 %conv, ptr %arrayidx, align 2 + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %2 = and i64 %indvars.iv.next, 4294967295 + %tobool.not = icmp eq i64 %2, 0 + br i1 %tobool.not, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + store i32 0, ptr @h, align 4 + br label %for.end + +for.end: + ret void +} + +define dso_local void @srcaffine_sinkinvariant() { +; CHECK-LABEL: @srcaffine_sinkinvariant +; CHECK:vector.memcheck: +; CHECK:vector.body: +; +entry: + %.pr = load i32, i32* @a, align 4 + %tobool.not2 = icmp eq i32 %.pr, 0 + br i1 %tobool.not2, label %for.end, label %for.body.preheader + +for.body.preheader: + %0 = sext i32 %.pr to i64 + br label %for.body + +for.body: + %indvars.iv = phi i64 [ %0, 
%for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds [1 x i32], [1 x i32]* @b, i64 0, i64 %indvars.iv + %1 = load i32, i32* %arrayidx, align 4 + %tobool1.not = icmp eq i32 %1, 0 + br i1 %tobool1.not, label %for.inc, label %if.then + +if.then: + store i32 ptrtoint ([1 x i32]* @b to i32), i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i64 0, i64 0), align 4 + br label %for.inc + +for.inc: + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %2 = trunc i64 %indvars.iv.next to i32 + %tobool.not = icmp eq i32 %2, 0 + br i1 %tobool.not, label %for.cond.for.end_crit_edge, label %for.body + +for.cond.for.end_crit_edge: + store i32 0, i32* @a, align 4 + br label %for.end + +for.end: + ret void +} + +define dso_local void @srcinvariant_sinkaffine() { +; CHECK-LABEL: @srcinvariant_sinkaffine +; CHECK:vector.memcheck: +; CHECK:vector.body: +; +entry: + %0 = load i32, i32* @n, align 4 + %cmp10 = icmp sgt i32 %0, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %wide.trip.count = zext i32 %0 to i64 + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %1 = load double, double* getelementptr inbounds (%struct.Arrays, %struct.Arrays* @s1, i64 0, i32 3), align 8 + %arrayidx = getelementptr inbounds %struct.Arrays, %struct.Arrays* @s1, i64 0, i32 1, i64 %indvars.iv + %2 = load double, double* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds %struct.Arrays, %struct.Arrays* @s1, i64 0, i32 2, i64 %indvars.iv + %3 = load double, double* %arrayidx2, align 8 + %4 = tail call double @llvm.fmuladd.f64(double %2, double %3, double %1) + %arrayidx4 = getelementptr inbounds %struct.Arrays, %struct.Arrays* @s1, i64 0, i32 0, i64 %indvars.iv + store double %4, double* %arrayidx4, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + 
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} Index: llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll +++ llvm/test/Transforms/LoopVectorize/vectorize-pointer-phis.ll @@ -109,7 +109,8 @@ define i32 @load_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: @load_with_pointer_phi_outside_loop -; CHECK-NOT: vector.body +; CHECK: vector.body +; CHECK: memcheck ; entry: br i1 %c.0, label %if.then, label %if.else @@ -141,7 +142,8 @@ define i32 @store_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) { ; CHECK-LABEL: @store_with_pointer_phi_outside_loop -; CHECK-NOT: vector.body +; CHECK: vector.body +; CHECK: memcheck ; entry: br i1 %c.0, label %if.then, label %if.else Index: llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll =================================================================== --- llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -92,97 +92,426 @@ ; CHECK-NEXT: [[TMP0:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>* ; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>* -; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]] -; CHECK: for.cond1.preheader.us: -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]] +; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] +; CHECK: for.cond1.preheader.us.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = 
add i32 [[I]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[CONV6]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[I]], 225 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I]], 6 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[SCEVGEP25:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP5]] +; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[CONV6]] +; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 0 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP4]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 0 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult double* [[SCEVGEP]], [[SCEVGEP21]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult double* [[SCEVGEP19]], [[SCEVGEP17]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND027:%.*]] = icmp ult double* [[SCEVGEP]], [[SCEVGEP25]] +; CHECK-NEXT: [[BOUND128:%.*]] = icmp ult double* [[SCEVGEP23]], [[SCEVGEP17]] +; CHECK-NEXT: [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT29]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY4_US_PREHEADER]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[I]], 252 +; CHECK-NEXT: [[TMP8:%.*]] = load 
double, double* [[TMP7]], align 8, !alias.scope !0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT31]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = or i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[INDEX]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i64> [[TMP16]], i64 [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i64> [[TMP18]], i64 [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <2 x i64> [[TMP17]], +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <2 x i64> [[TMP19]], +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP20]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP20]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP21]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP24]]) +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x 
i1> [[TMP21]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP25]]) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP12]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP27]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP26]], i64 2 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x double>, <2 x double>* [[TMP29]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP31:%.*]] = fmul <2 x double> [[WIDE_LOAD30]], [[BROADCAST_SPLAT32]] +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP12]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x double>, <2 x double>* [[TMP33]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP32]], i64 2 +; CHECK-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <2 x double>, <2 x double>* [[TMP35]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP36:%.*]] = fsub <2 x double> [[WIDE_LOAD33]], [[TMP30]] +; CHECK-NEXT: [[TMP37:%.*]] = fsub <2 x double> [[WIDE_LOAD34]], [[TMP31]] +; CHECK-NEXT: [[TMP38:%.*]] = bitcast double* [[TMP32]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP36]], <2 x double>* [[TMP38]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP34]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP37]], <2 x double>* [[TMP39]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: 
[[TMP40:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[I]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]], label [[FOR_BODY4_US_PREHEADER]] +; CHECK: for.body4.us.preheader: +; CHECK-NEXT: [[K_013_US_PH:%.*]] = phi i32 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: -; CHECK-NEXT: [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[K_013_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY4_US]] ], [ [[K_013_US_PH]], [[FOR_BODY4_US_PREHEADER]] ] ; CHECK-NEXT: [[CONV_US:%.*]] = zext i32 [[K_013_US]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[K_013_US]], 225 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]] -; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP5]], align 8 -; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP41:%.*]] = icmp ult i32 [[K_013_US]], 225 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP41]]) +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP42]], align 8 +; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP7]], align 8 ; CHECK-NEXT: [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]] -; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP6]], align 8 +; CHECK-NEXT: 
[[TMP43:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP43]], align 8 ; CHECK-NEXT: [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]] -; CHECK-NEXT: store double [[SUB_US]], double* [[TMP6]], align 8 +; CHECK-NEXT: store double [[SUB_US]], double* [[TMP43]], align 8 ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_013_US]], 1 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] +; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY4_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: -; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[CONV6]], 15 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i32 [[I]], 210 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP8]]) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP44:%.*]] = add nuw nsw i64 [[CONV6]], 15 +; CHECK-NEXT: [[TMP45:%.*]] = icmp ult i32 [[I]], 210 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP45]]) +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP44]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_1:%.*]] = icmp ult i32 [[I]], 6 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_1]], label [[FOR_BODY4_US_PREHEADER_1:%.*]], label [[VECTOR_MEMCHECK_1:%.*]] +; CHECK: vector.memcheck.1: +; CHECK-NEXT: [[TMP47:%.*]] = add nuw nsw i64 [[CONV6]], 16 +; CHECK-NEXT: [[SCEVGEP25_1:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP47]] +; CHECK-NEXT: [[TMP48:%.*]] = add nuw nsw i64 [[CONV6]], 15 +; CHECK-NEXT: [[SCEVGEP23_1:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = add nuw nsw i64 
[[TMP3]], 16 +; CHECK-NEXT: [[SCEVGEP21_1:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 [[TMP49]] +; CHECK-NEXT: [[SCEVGEP19_1:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 15 +; CHECK-NEXT: [[SCEVGEP17_1:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP49]] +; CHECK-NEXT: [[SCEVGEP_1:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 15 +; CHECK-NEXT: [[BOUND0_1:%.*]] = icmp ult double* [[SCEVGEP_1]], [[SCEVGEP21_1]] +; CHECK-NEXT: [[BOUND1_1:%.*]] = icmp ult double* [[SCEVGEP19_1]], [[SCEVGEP17_1]] +; CHECK-NEXT: [[FOUND_CONFLICT_1:%.*]] = and i1 [[BOUND0_1]], [[BOUND1_1]] +; CHECK-NEXT: [[BOUND027_1:%.*]] = icmp ult double* [[SCEVGEP_1]], [[SCEVGEP25_1]] +; CHECK-NEXT: [[BOUND128_1:%.*]] = icmp ult double* [[SCEVGEP23_1]], [[SCEVGEP17_1]] +; CHECK-NEXT: [[FOUND_CONFLICT29_1:%.*]] = and i1 [[BOUND027_1]], [[BOUND128_1]] +; CHECK-NEXT: [[CONFLICT_RDX_1:%.*]] = or i1 [[FOUND_CONFLICT_1]], [[FOUND_CONFLICT29_1]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX_1]], label [[FOR_BODY4_US_PREHEADER_1]], label [[VECTOR_PH_1:%.*]] +; CHECK: vector.ph.1: +; CHECK-NEXT: [[N_VEC_1:%.*]] = and i32 [[I]], 252 +; CHECK-NEXT: [[TMP50:%.*]] = load double, double* [[TMP46]], align 8, !alias.scope !0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT_1:%.*]] = insertelement <2 x double> poison, double [[TMP50]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT_1:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT_1]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31_1:%.*]] = insertelement <2 x double> poison, double [[TMP50]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32_1:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT31_1]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY_1:%.*]] +; CHECK: vector.body.1: +; CHECK-NEXT: [[INDEX_1:%.*]] = phi i32 [ 0, [[VECTOR_PH_1]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY_1]] ] +; 
CHECK-NEXT: [[TMP51:%.*]] = or i32 [[INDEX_1]], 1 +; CHECK-NEXT: [[TMP52:%.*]] = or i32 [[INDEX_1]], 2 +; CHECK-NEXT: [[TMP53:%.*]] = or i32 [[INDEX_1]], 3 +; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[INDEX_1]] to i64 +; CHECK-NEXT: [[TMP55:%.*]] = zext i32 [[TMP51]] to i64 +; CHECK-NEXT: [[TMP56:%.*]] = zext i32 [[TMP52]] to i64 +; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP53]] to i64 +; CHECK-NEXT: [[TMP58:%.*]] = add nuw nsw i64 [[TMP54]], 15 +; CHECK-NEXT: [[TMP59:%.*]] = add nuw nsw i64 [[TMP55]], 15 +; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i64> poison, i64 [[TMP58]], i64 0 +; CHECK-NEXT: [[TMP61:%.*]] = insertelement <2 x i64> [[TMP60]], i64 [[TMP59]], i64 1 +; CHECK-NEXT: [[TMP62:%.*]] = add nuw nsw i64 [[TMP56]], 15 +; CHECK-NEXT: [[TMP63:%.*]] = add nuw nsw i64 [[TMP57]], 15 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i64> poison, i64 [[TMP62]], i64 0 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i64> [[TMP64]], i64 [[TMP63]], i64 1 +; CHECK-NEXT: [[TMP66:%.*]] = icmp ult <2 x i64> [[TMP61]], +; CHECK-NEXT: [[TMP67:%.*]] = icmp ult <2 x i64> [[TMP65]], +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <2 x i1> [[TMP66]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP68]]) +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <2 x i1> [[TMP66]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP69]]) +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <2 x i1> [[TMP67]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP70]]) +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i1> [[TMP67]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP71]]) +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP58]] +; CHECK-NEXT: [[TMP73:%.*]] = bitcast double* [[TMP72]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, <2 x double>* [[TMP73]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds double, double* [[TMP72]], i64 2 +; 
CHECK-NEXT: [[TMP75:%.*]] = bitcast double* [[TMP74]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD30_1:%.*]] = load <2 x double>, <2 x double>* [[TMP75]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP76:%.*]] = fmul <2 x double> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT_1]] +; CHECK-NEXT: [[TMP77:%.*]] = fmul <2 x double> [[WIDE_LOAD30_1]], [[BROADCAST_SPLAT32_1]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP58]] +; CHECK-NEXT: [[TMP79:%.*]] = bitcast double* [[TMP78]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD33_1:%.*]] = load <2 x double>, <2 x double>* [[TMP79]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds double, double* [[TMP78]], i64 2 +; CHECK-NEXT: [[TMP81:%.*]] = bitcast double* [[TMP80]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD34_1:%.*]] = load <2 x double>, <2 x double>* [[TMP81]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP82:%.*]] = fsub <2 x double> [[WIDE_LOAD33_1]], [[TMP76]] +; CHECK-NEXT: [[TMP83:%.*]] = fsub <2 x double> [[WIDE_LOAD34_1]], [[TMP77]] +; CHECK-NEXT: [[TMP84:%.*]] = bitcast double* [[TMP78]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP82]], <2 x double>* [[TMP84]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP85:%.*]] = bitcast double* [[TMP80]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP83]], <2 x double>* [[TMP85]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT_1]] = add nuw i32 [[INDEX_1]], 4 +; CHECK-NEXT: [[TMP86:%.*]] = icmp eq i32 [[INDEX_NEXT_1]], [[N_VEC_1]] +; CHECK-NEXT: br i1 [[TMP86]], label [[MIDDLE_BLOCK_1:%.*]], label [[VECTOR_BODY_1]], !llvm.loop [[LOOP8]] +; CHECK: middle.block.1: +; CHECK-NEXT: [[CMP_N_1:%.*]] = icmp eq i32 [[N_VEC_1]], [[I]] +; CHECK-NEXT: br i1 [[CMP_N_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]], label [[FOR_BODY4_US_PREHEADER_1]] +; CHECK: for.body4.us.preheader.1: +; CHECK-NEXT: 
[[K_013_US_PH_1:%.*]] = phi i32 [ 0, [[VECTOR_MEMCHECK_1]] ], [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[N_VEC_1]], [[MIDDLE_BLOCK_1]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US_1:%.*]] ; CHECK: for.body4.us.1: -; CHECK-NEXT: [[K_013_US_1:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ], [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ] +; CHECK-NEXT: [[K_013_US_1:%.*]] = phi i32 [ [[INC_US_1:%.*]], [[FOR_BODY4_US_1]] ], [ [[K_013_US_PH_1]], [[FOR_BODY4_US_PREHEADER_1]] ] ; CHECK-NEXT: [[NARROW:%.*]] = add nuw nsw i32 [[K_013_US_1]], 15 -; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[NARROW]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[K_013_US_1]], 210 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP10]] -; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, double* [[TMP12]], align 8 -; CHECK-NEXT: [[MATRIXEXT8_US_1:%.*]] = load double, double* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP87:%.*]] = zext i32 [[NARROW]] to i64 +; CHECK-NEXT: [[TMP88:%.*]] = icmp ult i32 [[K_013_US_1]], 210 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP88]]) +; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP87]] +; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, double* [[TMP89]], align 8 +; CHECK-NEXT: [[MATRIXEXT8_US_1:%.*]] = load double, double* [[TMP46]], align 8 ; CHECK-NEXT: [[MUL_US_1:%.*]] = fmul double [[MATRIXEXT_US_1]], [[MATRIXEXT8_US_1]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP10]] -; CHECK-NEXT: [[MATRIXEXT11_US_1:%.*]] = load double, double* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP87]] +; CHECK-NEXT: [[MATRIXEXT11_US_1:%.*]] = load double, double* [[TMP90]], align 8 ; CHECK-NEXT: [[SUB_US_1:%.*]] = fsub 
double [[MATRIXEXT11_US_1]], [[MUL_US_1]] -; CHECK-NEXT: store double [[SUB_US_1]], double* [[TMP13]], align 8 +; CHECK-NEXT: store double [[SUB_US_1]], double* [[TMP90]], align 8 ; CHECK-NEXT: [[INC_US_1]] = add nuw nsw i32 [[K_013_US_1]], 1 ; CHECK-NEXT: [[CMP2_US_1:%.*]] = icmp ult i32 [[INC_US_1]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1:%.*]] +; CHECK-NEXT: br i1 [[CMP2_US_1]], label [[FOR_BODY4_US_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]], !llvm.loop [[LOOP10]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1: -; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[CONV6]], 30 -; CHECK-NEXT: [[TMP15:%.*]] = icmp ult i32 [[I]], 195 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]]) -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP91:%.*]] = add nuw nsw i64 [[CONV6]], 30 +; CHECK-NEXT: [[TMP92:%.*]] = icmp ult i32 [[I]], 195 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP92]]) +; CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP91]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_2:%.*]] = icmp ult i32 [[I]], 6 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_2]], label [[FOR_BODY4_US_PREHEADER_2:%.*]], label [[VECTOR_MEMCHECK_2:%.*]] +; CHECK: vector.memcheck.2: +; CHECK-NEXT: [[TMP94:%.*]] = add nuw nsw i64 [[CONV6]], 31 +; CHECK-NEXT: [[SCEVGEP25_2:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP94]] +; CHECK-NEXT: [[TMP95:%.*]] = add nuw nsw i64 [[CONV6]], 30 +; CHECK-NEXT: [[SCEVGEP23_2:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP95]] +; CHECK-NEXT: [[TMP96:%.*]] = add nuw nsw i64 [[TMP3]], 31 +; CHECK-NEXT: [[SCEVGEP21_2:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 [[TMP96]] +; CHECK-NEXT: [[SCEVGEP19_2:%.*]] = getelementptr [225 x double], [225 x 
double]* [[A]], i64 0, i64 30 +; CHECK-NEXT: [[SCEVGEP17_2:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP96]] +; CHECK-NEXT: [[SCEVGEP_2:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 30 +; CHECK-NEXT: [[BOUND0_2:%.*]] = icmp ult double* [[SCEVGEP_2]], [[SCEVGEP21_2]] +; CHECK-NEXT: [[BOUND1_2:%.*]] = icmp ult double* [[SCEVGEP19_2]], [[SCEVGEP17_2]] +; CHECK-NEXT: [[FOUND_CONFLICT_2:%.*]] = and i1 [[BOUND0_2]], [[BOUND1_2]] +; CHECK-NEXT: [[BOUND027_2:%.*]] = icmp ult double* [[SCEVGEP_2]], [[SCEVGEP25_2]] +; CHECK-NEXT: [[BOUND128_2:%.*]] = icmp ult double* [[SCEVGEP23_2]], [[SCEVGEP17_2]] +; CHECK-NEXT: [[FOUND_CONFLICT29_2:%.*]] = and i1 [[BOUND027_2]], [[BOUND128_2]] +; CHECK-NEXT: [[CONFLICT_RDX_2:%.*]] = or i1 [[FOUND_CONFLICT_2]], [[FOUND_CONFLICT29_2]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX_2]], label [[FOR_BODY4_US_PREHEADER_2]], label [[VECTOR_PH_2:%.*]] +; CHECK: vector.ph.2: +; CHECK-NEXT: [[N_VEC_2:%.*]] = and i32 [[I]], 252 +; CHECK-NEXT: [[TMP97:%.*]] = load double, double* [[TMP93]], align 8, !alias.scope !0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT_2:%.*]] = insertelement <2 x double> poison, double [[TMP97]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT_2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT_2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31_2:%.*]] = insertelement <2 x double> poison, double [[TMP97]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32_2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT31_2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY_2:%.*]] +; CHECK: vector.body.2: +; CHECK-NEXT: [[INDEX_2:%.*]] = phi i32 [ 0, [[VECTOR_PH_2]] ], [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY_2]] ] +; CHECK-NEXT: [[TMP98:%.*]] = or i32 [[INDEX_2]], 1 +; CHECK-NEXT: [[TMP99:%.*]] = or i32 [[INDEX_2]], 2 +; CHECK-NEXT: [[TMP100:%.*]] = or i32 [[INDEX_2]], 3 +; CHECK-NEXT: [[TMP101:%.*]] = zext i32 
[[INDEX_2]] to i64 +; CHECK-NEXT: [[TMP102:%.*]] = zext i32 [[TMP98]] to i64 +; CHECK-NEXT: [[TMP103:%.*]] = zext i32 [[TMP99]] to i64 +; CHECK-NEXT: [[TMP104:%.*]] = zext i32 [[TMP100]] to i64 +; CHECK-NEXT: [[TMP105:%.*]] = add nuw nsw i64 [[TMP101]], 30 +; CHECK-NEXT: [[TMP106:%.*]] = add nuw nsw i64 [[TMP102]], 30 +; CHECK-NEXT: [[TMP107:%.*]] = insertelement <2 x i64> poison, i64 [[TMP105]], i64 0 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <2 x i64> [[TMP107]], i64 [[TMP106]], i64 1 +; CHECK-NEXT: [[TMP109:%.*]] = add nuw nsw i64 [[TMP103]], 30 +; CHECK-NEXT: [[TMP110:%.*]] = add nuw nsw i64 [[TMP104]], 30 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i64> poison, i64 [[TMP109]], i64 0 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <2 x i64> [[TMP111]], i64 [[TMP110]], i64 1 +; CHECK-NEXT: [[TMP113:%.*]] = icmp ult <2 x i64> [[TMP108]], +; CHECK-NEXT: [[TMP114:%.*]] = icmp ult <2 x i64> [[TMP112]], +; CHECK-NEXT: [[TMP115:%.*]] = extractelement <2 x i1> [[TMP113]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP115]]) +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <2 x i1> [[TMP113]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP116]]) +; CHECK-NEXT: [[TMP117:%.*]] = extractelement <2 x i1> [[TMP114]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP117]]) +; CHECK-NEXT: [[TMP118:%.*]] = extractelement <2 x i1> [[TMP114]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP118]]) +; CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP105]] +; CHECK-NEXT: [[TMP120:%.*]] = bitcast double* [[TMP119]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, <2 x double>* [[TMP120]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP121:%.*]] = getelementptr inbounds double, double* [[TMP119]], i64 2 +; CHECK-NEXT: [[TMP122:%.*]] = bitcast double* [[TMP121]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD30_2:%.*]] = load <2 x double>, <2 x double>* 
[[TMP122]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP123:%.*]] = fmul <2 x double> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT_2]] +; CHECK-NEXT: [[TMP124:%.*]] = fmul <2 x double> [[WIDE_LOAD30_2]], [[BROADCAST_SPLAT32_2]] +; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP105]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast double* [[TMP125]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD33_2:%.*]] = load <2 x double>, <2 x double>* [[TMP126]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds double, double* [[TMP125]], i64 2 +; CHECK-NEXT: [[TMP128:%.*]] = bitcast double* [[TMP127]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD34_2:%.*]] = load <2 x double>, <2 x double>* [[TMP128]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP129:%.*]] = fsub <2 x double> [[WIDE_LOAD33_2]], [[TMP123]] +; CHECK-NEXT: [[TMP130:%.*]] = fsub <2 x double> [[WIDE_LOAD34_2]], [[TMP124]] +; CHECK-NEXT: [[TMP131:%.*]] = bitcast double* [[TMP125]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP129]], <2 x double>* [[TMP131]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP132:%.*]] = bitcast double* [[TMP127]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP130]], <2 x double>* [[TMP132]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT_2]] = add nuw i32 [[INDEX_2]], 4 +; CHECK-NEXT: [[TMP133:%.*]] = icmp eq i32 [[INDEX_NEXT_2]], [[N_VEC_2]] +; CHECK-NEXT: br i1 [[TMP133]], label [[MIDDLE_BLOCK_2:%.*]], label [[VECTOR_BODY_2]], !llvm.loop [[LOOP8]] +; CHECK: middle.block.2: +; CHECK-NEXT: [[CMP_N_2:%.*]] = icmp eq i32 [[N_VEC_2]], [[I]] +; CHECK-NEXT: br i1 [[CMP_N_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]], label [[FOR_BODY4_US_PREHEADER_2]] +; CHECK: for.body4.us.preheader.2: +; CHECK-NEXT: [[K_013_US_PH_2:%.*]] = phi i32 [ 0, [[VECTOR_MEMCHECK_2]] ], [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[N_VEC_2]], 
[[MIDDLE_BLOCK_2]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]] ; CHECK: for.body4.us.2: -; CHECK-NEXT: [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ] -; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30 -; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW17]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]]) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]] -; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, double* [[TMP19]], align 8 -; CHECK-NEXT: [[MATRIXEXT8_US_2:%.*]] = load double, double* [[TMP16]], align 8 +; CHECK-NEXT: [[K_013_US_2:%.*]] = phi i32 [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ], [ [[K_013_US_PH_2]], [[FOR_BODY4_US_PREHEADER_2]] ] +; CHECK-NEXT: [[NARROW35:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30 +; CHECK-NEXT: [[TMP134:%.*]] = zext i32 [[NARROW35]] to i64 +; CHECK-NEXT: [[TMP135:%.*]] = icmp ult i32 [[K_013_US_2]], 195 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP135]]) +; CHECK-NEXT: [[TMP136:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP134]] +; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, double* [[TMP136]], align 8 +; CHECK-NEXT: [[MATRIXEXT8_US_2:%.*]] = load double, double* [[TMP93]], align 8 ; CHECK-NEXT: [[MUL_US_2:%.*]] = fmul double [[MATRIXEXT_US_2]], [[MATRIXEXT8_US_2]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP17]] -; CHECK-NEXT: [[MATRIXEXT11_US_2:%.*]] = load double, double* [[TMP20]], align 8 +; CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP134]] +; CHECK-NEXT: [[MATRIXEXT11_US_2:%.*]] = load double, double* [[TMP137]], align 8 ; CHECK-NEXT: [[SUB_US_2:%.*]] = fsub double [[MATRIXEXT11_US_2]], [[MUL_US_2]] 
-; CHECK-NEXT: store double [[SUB_US_2]], double* [[TMP20]], align 8 +; CHECK-NEXT: store double [[SUB_US_2]], double* [[TMP137]], align 8 ; CHECK-NEXT: [[INC_US_2]] = add nuw nsw i32 [[K_013_US_2]], 1 ; CHECK-NEXT: [[CMP2_US_2:%.*]] = icmp ult i32 [[INC_US_2]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2:%.*]] +; CHECK-NEXT: br i1 [[CMP2_US_2]], label [[FOR_BODY4_US_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]], !llvm.loop [[LOOP10]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2: -; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[CONV6]], 45 -; CHECK-NEXT: [[TMP22:%.*]] = icmp ult i32 [[I]], 180 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP21]] +; CHECK-NEXT: [[TMP138:%.*]] = add nuw nsw i64 [[CONV6]], 45 +; CHECK-NEXT: [[TMP139:%.*]] = icmp ult i32 [[I]], 180 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP139]]) +; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP138]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_3:%.*]] = icmp ult i32 [[I]], 6 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_3]], label [[FOR_BODY4_US_PREHEADER_3:%.*]], label [[VECTOR_MEMCHECK_3:%.*]] +; CHECK: vector.memcheck.3: +; CHECK-NEXT: [[TMP141:%.*]] = add nuw nsw i64 [[CONV6]], 46 +; CHECK-NEXT: [[SCEVGEP25_3:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP141]] +; CHECK-NEXT: [[TMP142:%.*]] = add nuw nsw i64 [[CONV6]], 45 +; CHECK-NEXT: [[SCEVGEP23_3:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP142]] +; CHECK-NEXT: [[TMP143:%.*]] = add nuw nsw i64 [[TMP3]], 46 +; CHECK-NEXT: [[SCEVGEP21_3:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 [[TMP143]] +; CHECK-NEXT: [[SCEVGEP19_3:%.*]] = getelementptr [225 x double], [225 x double]* [[A]], i64 0, i64 45 
+; CHECK-NEXT: [[SCEVGEP17_3:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 [[TMP143]] +; CHECK-NEXT: [[SCEVGEP_3:%.*]] = getelementptr [225 x double], [225 x double]* [[B]], i64 0, i64 45 +; CHECK-NEXT: [[BOUND0_3:%.*]] = icmp ult double* [[SCEVGEP_3]], [[SCEVGEP21_3]] +; CHECK-NEXT: [[BOUND1_3:%.*]] = icmp ult double* [[SCEVGEP19_3]], [[SCEVGEP17_3]] +; CHECK-NEXT: [[FOUND_CONFLICT_3:%.*]] = and i1 [[BOUND0_3]], [[BOUND1_3]] +; CHECK-NEXT: [[BOUND027_3:%.*]] = icmp ult double* [[SCEVGEP_3]], [[SCEVGEP25_3]] +; CHECK-NEXT: [[BOUND128_3:%.*]] = icmp ult double* [[SCEVGEP23_3]], [[SCEVGEP17_3]] +; CHECK-NEXT: [[FOUND_CONFLICT29_3:%.*]] = and i1 [[BOUND027_3]], [[BOUND128_3]] +; CHECK-NEXT: [[CONFLICT_RDX_3:%.*]] = or i1 [[FOUND_CONFLICT_3]], [[FOUND_CONFLICT29_3]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX_3]], label [[FOR_BODY4_US_PREHEADER_3]], label [[VECTOR_PH_3:%.*]] +; CHECK: vector.ph.3: +; CHECK-NEXT: [[N_VEC_3:%.*]] = and i32 [[I]], 252 +; CHECK-NEXT: [[TMP144:%.*]] = load double, double* [[TMP140]], align 8, !alias.scope !0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT_3:%.*]] = insertelement <2 x double> poison, double [[TMP144]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT_3:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT_3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT31_3:%.*]] = insertelement <2 x double> poison, double [[TMP144]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT32_3:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT31_3]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY_3:%.*]] +; CHECK: vector.body.3: +; CHECK-NEXT: [[INDEX_3:%.*]] = phi i32 [ 0, [[VECTOR_PH_3]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY_3]] ] +; CHECK-NEXT: [[TMP145:%.*]] = or i32 [[INDEX_3]], 1 +; CHECK-NEXT: [[TMP146:%.*]] = or i32 [[INDEX_3]], 2 +; CHECK-NEXT: [[TMP147:%.*]] = or i32 [[INDEX_3]], 3 +; CHECK-NEXT: [[TMP148:%.*]] = zext i32 [[INDEX_3]] to i64 +; CHECK-NEXT: 
[[TMP149:%.*]] = zext i32 [[TMP145]] to i64 +; CHECK-NEXT: [[TMP150:%.*]] = zext i32 [[TMP146]] to i64 +; CHECK-NEXT: [[TMP151:%.*]] = zext i32 [[TMP147]] to i64 +; CHECK-NEXT: [[TMP152:%.*]] = add nuw nsw i64 [[TMP148]], 45 +; CHECK-NEXT: [[TMP153:%.*]] = add nuw nsw i64 [[TMP149]], 45 +; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i64> poison, i64 [[TMP152]], i64 0 +; CHECK-NEXT: [[TMP155:%.*]] = insertelement <2 x i64> [[TMP154]], i64 [[TMP153]], i64 1 +; CHECK-NEXT: [[TMP156:%.*]] = add nuw nsw i64 [[TMP150]], 45 +; CHECK-NEXT: [[TMP157:%.*]] = add nuw nsw i64 [[TMP151]], 45 +; CHECK-NEXT: [[TMP158:%.*]] = insertelement <2 x i64> poison, i64 [[TMP156]], i64 0 +; CHECK-NEXT: [[TMP159:%.*]] = insertelement <2 x i64> [[TMP158]], i64 [[TMP157]], i64 1 +; CHECK-NEXT: [[TMP160:%.*]] = icmp ult <2 x i64> [[TMP155]], +; CHECK-NEXT: [[TMP161:%.*]] = icmp ult <2 x i64> [[TMP159]], +; CHECK-NEXT: [[TMP162:%.*]] = extractelement <2 x i1> [[TMP160]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP162]]) +; CHECK-NEXT: [[TMP163:%.*]] = extractelement <2 x i1> [[TMP160]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP163]]) +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x i1> [[TMP161]], i64 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP164]]) +; CHECK-NEXT: [[TMP165:%.*]] = extractelement <2 x i1> [[TMP161]], i64 1 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP165]]) +; CHECK-NEXT: [[TMP166:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP152]] +; CHECK-NEXT: [[TMP167:%.*]] = bitcast double* [[TMP166]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, <2 x double>* [[TMP167]], align 8, !alias.scope !3 +; CHECK-NEXT: [[TMP168:%.*]] = getelementptr inbounds double, double* [[TMP166]], i64 2 +; CHECK-NEXT: [[TMP169:%.*]] = bitcast double* [[TMP168]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD30_3:%.*]] = load <2 x double>, <2 x double>* [[TMP169]], align 8, !alias.scope !3 +; 
CHECK-NEXT: [[TMP170:%.*]] = fmul <2 x double> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT_3]] +; CHECK-NEXT: [[TMP171:%.*]] = fmul <2 x double> [[WIDE_LOAD30_3]], [[BROADCAST_SPLAT32_3]] +; CHECK-NEXT: [[TMP172:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP152]] +; CHECK-NEXT: [[TMP173:%.*]] = bitcast double* [[TMP172]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD33_3:%.*]] = load <2 x double>, <2 x double>* [[TMP173]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP174:%.*]] = getelementptr inbounds double, double* [[TMP172]], i64 2 +; CHECK-NEXT: [[TMP175:%.*]] = bitcast double* [[TMP174]] to <2 x double>* +; CHECK-NEXT: [[WIDE_LOAD34_3:%.*]] = load <2 x double>, <2 x double>* [[TMP175]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP176:%.*]] = fsub <2 x double> [[WIDE_LOAD33_3]], [[TMP170]] +; CHECK-NEXT: [[TMP177:%.*]] = fsub <2 x double> [[WIDE_LOAD34_3]], [[TMP171]] +; CHECK-NEXT: [[TMP178:%.*]] = bitcast double* [[TMP172]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP176]], <2 x double>* [[TMP178]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[TMP179:%.*]] = bitcast double* [[TMP174]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP177]], <2 x double>* [[TMP179]], align 8, !alias.scope !5, !noalias !7 +; CHECK-NEXT: [[INDEX_NEXT_3]] = add nuw i32 [[INDEX_3]], 4 +; CHECK-NEXT: [[TMP180:%.*]] = icmp eq i32 [[INDEX_NEXT_3]], [[N_VEC_3]] +; CHECK-NEXT: br i1 [[TMP180]], label [[MIDDLE_BLOCK_3:%.*]], label [[VECTOR_BODY_3]], !llvm.loop [[LOOP8]] +; CHECK: middle.block.3: +; CHECK-NEXT: [[CMP_N_3:%.*]] = icmp eq i32 [[N_VEC_3]], [[I]] +; CHECK-NEXT: br i1 [[CMP_N_3]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY4_US_PREHEADER_3]] +; CHECK: for.body4.us.preheader.3: +; CHECK-NEXT: [[K_013_US_PH_3:%.*]] = phi i32 [ 0, [[VECTOR_MEMCHECK_3]] ], [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[N_VEC_3]], [[MIDDLE_BLOCK_3]] ] ; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]] ; 
CHECK: for.body4.us.3: -; CHECK-NEXT: [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ] -; CHECK-NEXT: [[NARROW18:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45 -; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW18]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP25]]) -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]] -; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, double* [[TMP26]], align 8 -; CHECK-NEXT: [[MATRIXEXT8_US_3:%.*]] = load double, double* [[TMP23]], align 8 +; CHECK-NEXT: [[K_013_US_3:%.*]] = phi i32 [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ], [ [[K_013_US_PH_3]], [[FOR_BODY4_US_PREHEADER_3]] ] +; CHECK-NEXT: [[NARROW36:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45 +; CHECK-NEXT: [[TMP181:%.*]] = zext i32 [[NARROW36]] to i64 +; CHECK-NEXT: [[TMP182:%.*]] = icmp ult i32 [[K_013_US_3]], 180 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP182]]) +; CHECK-NEXT: [[TMP183:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP181]] +; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, double* [[TMP183]], align 8 +; CHECK-NEXT: [[MATRIXEXT8_US_3:%.*]] = load double, double* [[TMP140]], align 8 ; CHECK-NEXT: [[MUL_US_3:%.*]] = fmul double [[MATRIXEXT_US_3]], [[MATRIXEXT8_US_3]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP24]] -; CHECK-NEXT: [[MATRIXEXT11_US_3:%.*]] = load double, double* [[TMP27]], align 8 +; CHECK-NEXT: [[TMP184:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[TMP181]] +; CHECK-NEXT: [[MATRIXEXT11_US_3:%.*]] = load double, double* [[TMP184]], align 8 ; CHECK-NEXT: [[SUB_US_3:%.*]] = fsub double [[MATRIXEXT11_US_3]], [[MUL_US_3]] -; CHECK-NEXT: store double [[SUB_US_3]], double* [[TMP27]], align 8 
+; CHECK-NEXT: store double [[SUB_US_3]], double* [[TMP184]], align 8 ; CHECK-NEXT: [[INC_US_3]] = add nuw nsw i32 [[K_013_US_3]], 1 ; CHECK-NEXT: [[CMP2_US_3:%.*]] = icmp ult i32 [[INC_US_3]], [[I]] -; CHECK-NEXT: br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]] +; CHECK-NEXT: br i1 [[CMP2_US_3]], label [[FOR_BODY4_US_3]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ;