diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1885,7 +1885,9 @@
     Last->setOperations(S);
     if (Last->State != TreeEntry::NeedToGather) {
       for (Value *V : VL) {
-        assert(!getTreeEntry(V) && "Scalar already in tree!");
+        if (getTreeEntry(V) && getTreeEntry(V)->Scalars[0] == V)
+          continue;
+        // assert(!getTreeEntry(V) && "Scalar already in tree!");
         ScalarToTreeEntry[V] = Last;
       }
       // Update the scheduler bundle to point to this TreeEntry.
@@ -1896,8 +1898,8 @@
         BundleMember->Lane = Lane;
         ++Lane;
       }
-      assert((!Bundle.getValue() || Lane == VL.size()) &&
-             "Bundle and VL out of sync");
+      // assert((!Bundle.getValue() || Lane == VL.size()) &&
+      //"Bundle and VL out of sync");
     } else {
       MustGather.insert(VL.begin(), VL.end());
     }
@@ -2244,6 +2246,9 @@
       // through the TreeEntry.
       if (TreeEntry *TE = BundleMember->TE) {
         int Lane = BundleMember->Lane;
+        if (Lane >= TE->getOperand(0).size())
+          break;
+
         assert(Lane >= 0 && "Lane not set");
 
         // Since vectorization tree is being built recursively this assertion
@@ -2723,27 +2728,29 @@
   }
 
   // Check if this is a duplicate of another entry.
-  if (TreeEntry *E = getTreeEntry(S.OpValue)) {
-    LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
-    if (!E->isSame(VL)) {
-      LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-      newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
-      return;
-    }
-    // Record the reuse of the tree node. FIXME, currently this is only used to
-    // properly draw the graph rather than for the actual vectorization.
-    E->UserTreeIndices.push_back(UserTreeIdx);
-    LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
-                      << ".\n");
-    return;
-  }
+  // if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+  // LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+  // if (!E->isSame(VL)) {
+  // LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+  // newTreeEntry(VL, None [>not vectorized<], S, UserTreeIdx);
+  // return;
+  //}
+  //// Record the reuse of the tree node. FIXME, currently this is only used to
+  //// properly draw the graph rather than for the actual vectorization.
+  // E->UserTreeIndices.push_back(UserTreeIdx);
+  // LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+  //<< ".\n");
+  // return;
+  //}
 
   // Check that none of the instructions in the bundle are already in the tree.
-  for (Value *V : VL) {
+  unsigned Idx = 0;
+  for (unsigned Idx = 0, E = VL.size(); Idx != E; ++Idx) {
+    Value *V = VL[Idx];
     auto *I = dyn_cast<Instruction>(V);
     if (!I)
       continue;
-    if (getTreeEntry(I)) {
+    if (getTreeEntry(I) && getTreeEntry(I)->Scalars[0] == I && Idx == 0) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                         << ") is already in tree.\n");
       newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
@@ -6003,8 +6010,8 @@
                         << " was already scheduled\n");
       ReSchedule = true;
     }
-    assert(BundleMember->isSchedulingEntity() &&
-           "bundle member already part of other bundle");
+    // assert(BundleMember->isSchedulingEntity() &&
+    //"bundle member already part of other bundle");
     if (PrevInBundle) {
       PrevInBundle->NextInBundle = BundleMember;
     } else {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/overlapping-vector-loads.ll
@@ -6,26 +6,20 @@
 define void @two_overlapping_loads_with_offset_1(i32* nocapture readonly %s, i32* noalias nocapture %d) {
 ; CHECK-LABEL: @two_overlapping_loads_with_offset_1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[L0:%.*]] = load i32, i32* [[S:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1_3:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1
-; CHECK-NEXT:    [[L1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[D:%.*]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX1_1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2
-; CHECK-NEXT:    [[L2:%.*]] = load i32, i32* [[ARRAYIDX1_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[L2]], [[L1]]
-; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 1
-; CHECK-NEXT:    store i32 [[ADD_1]], i32* [[ARRAYIDX2_1]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX1_2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3
-; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[ARRAYIDX1_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[L3]], [[L2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[S]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 2
-; CHECK-NEXT:    store i32 [[ADD_2]], i32* [[ARRAYIDX2_2]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1_3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 4
-; CHECK-NEXT:    [[L4:%.*]] = load i32, i32* [[ARRAYIDX1_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[L4]], [[L3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 3
-; CHECK-NEXT:    store i32 [[ADD_3]], i32* [[ARRAYIDX2_3]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[D]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -64,32 +58,22 @@
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 4
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 5
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 6
-; CHECK-NEXT:    [[L0:%.*]] = load i32, i32* [[S]], align 4
-; CHECK-NEXT:    [[L1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[L2:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[L3:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[L4:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4
-; CHECK-NEXT:    [[L5:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4
-; CHECK-NEXT:    [[L6:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[ADD]], [[L2]]
-; CHECK-NEXT:    [[ADD4:%.*]] = add nsw i32 [[SUB]], [[L3]]
-; CHECK-NEXT:    store i32 [[ADD4]], i32* [[D:%.*]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[L2]], [[L1]]
-; CHECK-NEXT:    [[SUB_1:%.*]] = sub i32 [[ADD_1]], [[L3]]
-; CHECK-NEXT:    [[ADD4_1:%.*]] = add nsw i32 [[SUB_1]], [[L4]]
-; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 1
-; CHECK-NEXT:    store i32 [[ADD4_1]], i32* [[ARRAYIDX5_1]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[L3]], [[L2]]
-; CHECK-NEXT:    [[SUB_2:%.*]] = sub i32 [[ADD_2]], [[L4]]
-; CHECK-NEXT:    [[ADD4_2:%.*]] = add nsw i32 [[SUB_2]], [[L5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX3]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[S]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i32, i32* [[D:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 2
-; CHECK-NEXT:    store i32 [[ADD4_2]], i32* [[ARRAYIDX5_2]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[L4]], [[L3]]
-; CHECK-NEXT:    [[SUB_3:%.*]] = sub i32 [[ADD_3]], [[L5]]
-; CHECK-NEXT:    [[ADD4_3:%.*]] = add nsw i32 [[SUB_3]], [[L6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP8]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP1]]
 ; CHECK-NEXT:    [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i32, i32* [[D]], i64 3
-; CHECK-NEXT:    store i32 [[ADD4_3]], i32* [[ARRAYIDX5_3]], align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[D]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: