diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1158,14 +1158,11 @@
         LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
                           << '\n');
         GroupB = createInterleaveGroup(B, DesB.Stride, DesB.Alignment);
-      } else if (CompletedLoadGroups.contains(GroupB)) {
-        // Skip B if no new instructions can be added to its load group.
-        continue;
+        if (B->mayWriteToMemory())
+          StoreGroups.insert(GroupB);
+        else
+          LoadGroups.insert(GroupB);
       }
-      if (B->mayWriteToMemory())
-        StoreGroups.insert(GroupB);
-      else
-        LoadGroups.insert(GroupB);
     }
 
     for (auto AI = std::next(BI); AI != E; ++AI) {
@@ -1191,38 +1188,62 @@
       // Because accesses (2) and (3) are dependent, we can group (2) with (1)
      // but not with (4). If we did, the dependent access (3) would be within
       // the boundaries of the (2, 4) group.
-      if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
-        // If a dependence exists and A is already in a group, we know that A
-        // must be a store since A precedes B and WAR dependences are allowed.
-        // Thus, A would be sunk below B. We release A's group to prevent this
-        // illegal code motion. A will then be free to form another group with
-        // instructions that precede it.
-        if (isInterleaved(A)) {
-          InterleaveGroup<Instruction> *StoreGroup = getInterleaveGroup(A);
-
-          LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
-                               "dependence between " << *A << " and "<< *B << '\n');
-
-          StoreGroups.remove(StoreGroup);
-          releaseGroup(StoreGroup);
-        }
-        // If B is a load and part of an interleave group, no earlier loads can
-        // be added to B's interleave group, because this would mean the load B
-        // would need to be moved across store A. Mark the interleave group as
-        // complete.
-        if (GroupB && isa<LoadInst>(B)) {
-          LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
-                            << " as complete.\n");
-
-          CompletedLoadGroups.insert(GroupB);
+      auto DependentMember = [&](InterleaveGroup<Instruction> *Group,
+                                 StrideEntry *A) -> Instruction * {
+        for (uint32_t Index = 0; Index < Group->getFactor(); ++Index) {
+          Instruction *MemberOfGroupB = Group->getMember(Index);
+          if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
+                                    A, &*AccessStrideInfo.find(MemberOfGroupB)))
+            return MemberOfGroupB;
         }
+        return nullptr;
+      };
 
-        // If a dependence exists and A is not already in a group (or it was
-        // and we just released it), B might be hoisted above A (if B is a
-        // load) or another store might be sunk below A (if B is a store). In
-        // either case, we can't add additional instructions to B's group. B
-        // will only form a group with instructions that it precedes.
-        break;
+      auto GroupA = getInterleaveGroup(A);
+      // If A is a load, dependencies are tolerable; there's nothing to do here.
+      // If both A and B belong to the same (store) group, they are independent,
+      // even if dependencies have not been recorded.
+      // If both GroupA and GroupB are null, there's nothing to do here.
+      if (A->mayWriteToMemory() && GroupA != GroupB) {
+        Instruction *DependentInst = nullptr;
+        // If GroupB is a load group, we have to compare AI against all
+        // members of GroupB because if any load within GroupB has a dependency
+        // on AI, we need to mark GroupB as complete and also release the
+        // store GroupA (if A belongs to one). The former prevents incorrect
+        // hoisting of load B above store A while the latter prevents incorrect
+        // sinking of store A below load B.
+        if (GroupB && LoadGroups.contains(GroupB))
+          DependentInst = DependentMember(GroupB, &*AI);
+        else if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
+          DependentInst = B;
+
+        if (DependentInst) {
+          // A has a store dependence on B (or on some load within GroupB) and
+          // is part of a store group. Release A's group to prevent illegal
+          // sinking of A below B. A will then be free to form another group
+          // with instructions that precede it.
+          if (GroupA && StoreGroups.contains(GroupA)) {
+            LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
+                                 "dependence between "
+                              << *A << " and " << *DependentInst << '\n');
+            StoreGroups.remove(GroupA);
+            releaseGroup(GroupA);
+          }
+          // If B is a load and part of an interleave group, no earlier loads
+          // can be added to B's interleave group, because this would mean the
+          // DependentInst would move across store A. Mark the interleave group
+          // as complete.
+          if (GroupB && LoadGroups.contains(GroupB)) {
+            LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
+                              << " as complete.\n");
+            CompletedLoadGroups.insert(GroupB);
+          }
+        }
+      }
+      if (CompletedLoadGroups.contains(GroupB)) {
+        // Skip trying to add A to B, continue to look for other conflicting A's
+        // in groups to be released.
+        continue;
       }
 
       // At this point, we've checked for illegal code motion. If either A or B
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-hoist-load-across-store.ll
@@ -121,7 +121,6 @@
 ; compare against the obstructing stores (%l2 versus the store) there is no
 ; dependency. However, the other load in %l2's interleave group (%l3) does
 ; obstruct with the store.
-; FIXME: The test case is currently mis-compiled.
 define void @pr63602_2(ptr %arr) {
 ; CHECK-LABEL: define void @pr63602_2
 ; CHECK-SAME: (ptr [[ARR:%.*]]) {
@@ -140,40 +139,64 @@
 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[INDEX]], 3
 ; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = add i64 1, [[TMP5]]
 ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX2]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 -2
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX2]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX2]], 6
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX2]], 9
+; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP6]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 0
-; CHECK-NEXT: store i32 [[TMP14]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 1
-; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP11]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 2
-; CHECK-NEXT: store i32 [[TMP16]], ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC4]], i32 3
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 0
 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP13]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP18]], i32 0
-; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP10]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP18]], i32 1
-; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP11]], align 4
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i32 2
-; CHECK-NEXT: store i32 [[TMP21]], ptr [[TMP12]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP18]], i32 3
-; CHECK-NEXT: store i32 [[TMP22]], ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 1
+; CHECK-NEXT: store i32 [[TMP18]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 2
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[STRIDED_VEC]], i32 3
+; CHECK-NEXT: store i32 [[TMP20]], ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[TMP6]], 2
+; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[TMP8]], 2
+; CHECK-NEXT: [[TMP24:%.*]] = add nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[TMP16]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP30]], i32 1
+; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP31]], i32 2
+; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP32]], i32 3
+; CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[TMP25]], align 4
+; CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP26]], align 4
+; CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[TMP27]], align 4
+; CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr [[TMP28]], align 4
+; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> poison, i32 [[TMP37]], i32 0
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP38]], i32 1
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x i32> [[TMP42]], i32 [[TMP39]], i32 2
+; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i32> [[TMP43]], i32 [[TMP40]], i32 3
+; CHECK-NEXT: [[TMP45:%.*]] = add <4 x i32> [[TMP36]], [[TMP44]]
+; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i32> [[TMP45]], i32 0
+; CHECK-NEXT: store i32 [[TMP46]], ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i32> [[TMP45]], i32 1
+; CHECK-NEXT: store i32 [[TMP47]], ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <4 x i32> [[TMP45]], i32 2
+; CHECK-NEXT: store i32 [[TMP48]], ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i32> [[TMP45]], i32 3
+; CHECK-NEXT: store i32 [[TMP49]], ptr [[TMP16]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
@@ -195,7 +218,7 @@
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
 ; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-sink-store-across-load.ll
@@ -3,15 +3,9 @@
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2"
 target triple = "x86_64-apple-macos"
 
-; This is currently miscompiled.
-; %l2 load and the preceeding store has a dependency. However, we currently sink
+; %l2 load and the preceding store have a dependency. We should not sink
 ; that store into the last store (by creating an interleaved store group). This
-; means the loaded %l2 has incorrect value.
-; We do not release this store group correctly because the next interleave group
-; chosen compares only the memory access of last load in program (%l3) against the dependent store location
-; (%gep.iv.1.plus.2) and they are different, thereby incorrectly assuming no
-; dependency. We need to compare against all loads in that interleaved group
-; (%l2 is part of it).
+; means the loaded %l2 would have an incorrect value.
 define void @avoid_sinking_store_across_load(ptr %arr) {
 ; CHECK-LABEL: define void @avoid_sinking_store_across_load
 ; CHECK-SAME: (ptr [[ARR:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -28,26 +22,28 @@
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 4
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP4]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[VEC_IND2]]
 ; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]],
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[ARR]], <4 x i64> [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[STRIDED_VEC5]],
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[STRIDED_VEC]],
 ; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP8]], <4 x ptr> [[TMP7]], i32 4, <4 x i1> )
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP9]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> )
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <12 x i32>, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC4]], <12 x i32> poison, <4 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[STRIDED_VEC5]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[TMP11]], <4 x ptr> [[TMP5]], i32 4, <4 x i1> )
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]],
 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i64> [[VEC_IND2]],
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 49, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
 ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 52, [[MIDDLE_BLOCK]] ], [ 4, [[ENTRY]] ]
@@ -70,7 +66,7 @@
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_IV_2]], align 4
 ; CHECK-NEXT: [[IV_2_NEXT]] = add nuw nsw i64 [[IV_2]], 3
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[IV_2]], 50
-; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-max-dependences.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-max-dependences.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-max-dependences.ll
@@ -0,0 +1,28 @@
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -max-dependences=0 -S %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; None of these stores have dependences between them, so we can successfully
+; interleave them even though the max-dependences threshold is 0.
+define void @three_interleaved_stores(ptr %arr) {
+; CHECK-LABEL: define void @three_interleaved_stores
+; CHECK: store <12 x i8>
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  %i.plus.1 = add nuw nsw i64 %i, 1
+  %i.plus.2 = add nuw nsw i64 %i, 2
+  %gep.i.plus.0 = getelementptr inbounds i8, ptr %arr, i64 %i
+  %gep.i.plus.1 = getelementptr inbounds i8, ptr %arr, i64 %i.plus.1
+  %gep.i.plus.2 = getelementptr inbounds i8, ptr %arr, i64 %i.plus.2
+  store i8 1, ptr %gep.i.plus.0
+  store i8 1, ptr %gep.i.plus.1
+  store i8 1, ptr %gep.i.plus.2
+  %i.next = add nuw nsw i64 %i, 3
+  %icmp = icmp ugt i64 %i, 1032
+  br i1 %icmp, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-use-after-free.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-use-after-free.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-use-after-free.ll
@@ -0,0 +1,95 @@
+; REQUIRES: asserts
+; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses,vectorutils -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-redhat-linux-gnu"
+
+%struct.foo = type { ptr, ptr, ptr }
+%struct.pluto = type <{ %struct.wombat, %struct.spam, %struct.wibble, [6 x i8] }>
+%struct.wombat = type { %struct.barney }
+%struct.barney = type { %struct.widget }
+%struct.widget = type { %struct.hoge }
+%struct.hoge = type { %struct.pluto.0 }
+%struct.pluto.0 = type { %struct.foo }
+%struct.spam = type { %struct.barney.1 }
+%struct.barney.1 = type { %struct.ham }
+%struct.ham = type { %struct.bar }
+%struct.bar = type { %struct.barney.2 }
+%struct.barney.2 = type { %struct.hoge.3 }
+%struct.hoge.3 = type { ptr, ptr, ptr }
+%struct.wibble = type { %struct.spam.4 }
+%struct.spam.4 = type { [2 x %struct.zot] }
+%struct.zot = type { %struct.bar.5 }
+%struct.bar.5 = type { i8 }
+%struct.baz = type { i64, %struct.pluto }
+
+; CHECK: LAA: Found a loop in test: bb4
+; CHECK: Too many dependences, stopped recording
+; If no dependences are recorded because there are too many, LoopAccessAnalysis
+; just conservatively returns true for any pair of instructions compared (even
+; those belonging to the same store group). This test makes sure that we do not
+; incorrectly release a store group which had no dependences between its
+; members, even if we have no dependences recorded because there are too many.
+
+; CHECK: LV: Creating an interleave group with: store ptr null, ptr %phi5, align 8
+; CHECK: LV: Inserted: store ptr %load12, ptr %getelementptr11, align 8
+; CHECK: into the interleave group with store ptr null, ptr %phi5
+; CHECK: LV: Inserted: store ptr %load7, ptr %getelementptr, align 8
+; CHECK: into the interleave group with store ptr null, ptr %phi5
+
+; CHECK: LV: Creating an interleave group with: store ptr null, ptr %getelementptr13, align 8
+; CHECK: LV: Inserted: store ptr null, ptr %phi6, align 8
+; CHECK: into the interleave group with store ptr null, ptr %getelementptr13
+; CHECK: LV: Invalidated store group due to dependence between store ptr %load7, ptr %getelementptr, align 8 and store ptr null, ptr %getelementptr13, align 8
+; CHECK-NOT: LV: Invalidated store group due to dependence between
+
+; Note: The (only) invalidated store group is the one containing A (store ptr %load7, ptr %getelementptr, align 8) which is:
+; Group with instructions:
+;   store ptr null, ptr %phi5, align 8
+;   store ptr %load7, ptr %getelementptr, align 8
+;   store ptr %load12, ptr %getelementptr11, align 8
+define void @test(ptr %arg, ptr %arg1) local_unnamed_addr #0 {
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb4, %bb
+  %phi = phi ptr [ %arg, %bb ], [ %phi3, %bb4 ]
+  %phi3 = phi ptr [ %arg1, %bb ], [ null, %bb4 ]
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb2
+  %phi5 = phi ptr [ %getelementptr15, %bb4 ], [ %phi, %bb2 ]
+  %phi6 = phi ptr [ %getelementptr14, %bb4 ], [ %phi3, %bb2 ]
+  %load = load i64, ptr %phi5, align 8
+  store i64 %load, ptr %phi, align 8
+  store i64 0, ptr %phi3, align 8
+  %load7 = load ptr, ptr %phi6, align 8
+  %load8 = load ptr, ptr %phi5, align 8
+  store ptr %load8, ptr %phi6, align 8
+  %getelementptr = getelementptr %struct.foo, ptr %phi5, i64 0, i32 1
+  %load9 = load ptr, ptr %phi5, align 8
+  store ptr %load9, ptr %phi6, align 8
+  %load10 = load ptr, ptr %phi5, align 8
+  store ptr %load10, ptr %phi6, align 8
+  store ptr null, ptr %phi5, align 8
+  store ptr %load7, ptr %getelementptr, align 8
+  %getelementptr11 = getelementptr %struct.pluto, ptr %phi5, i64 0, i32 1
+  %load12 = load ptr, ptr %phi6, align 8
+  %getelementptr13 = getelementptr %struct.pluto, ptr %phi6, i64 0, i32 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 2
+  store ptr null, ptr %phi6, align 8
+  store ptr null, ptr %getelementptr13, align 8
+  store ptr %load12, ptr %getelementptr11, align 8
+  store ptr null, ptr %phi5, align 8
+  %getelementptr14 = getelementptr inbounds %struct.baz, ptr %phi6, i64 1
+  %getelementptr15 = getelementptr %struct.baz, ptr %phi5, i64 1
+  %icmp = icmp eq ptr %phi6, %phi
+  br i1 %icmp, label %bb2, label %bb4
+}
+
+; Function Attrs: memory(readwrite, inaccessiblemem: none)
+declare void @foo() local_unnamed_addr #0
+
+; Function Attrs: memory(argmem: readwrite)
+declare void @pluto() local_unnamed_addr #1
+
+attributes #0 = { memory(readwrite, inaccessiblemem: none) }
+attributes #1 = { memory(argmem: readwrite) }