diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -7592,16 +7592,27 @@
     // Try to vectorize reductions that use PHINodes.
     if (PHINode *P = dyn_cast<PHINode>(it)) {
       // Check that the PHI is a reduction PHI.
-      if (P->getNumIncomingValues() != 2)
-        return Changed;
+      if (P->getNumIncomingValues() == 2) {
+        // Try to match and vectorize a horizontal reduction.
+        if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+                                     TTI)) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
+        }
+      }
+      // Try to vectorize the incoming values of the PHI, to catch reductions
+      // that feed into PHIs.
+      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
+        // Skip if the incoming block is the current BB for now.
+        // TODO: Collect the skipped incoming values and try to vectorize them
+        // after processing BB.
+        if (BB == P->getIncomingBlock(I))
+          continue;
 
-      // Try to match and vectorize a horizontal reduction.
-      if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
-                                   TTI)) {
-        Changed = true;
-        it = BB->begin();
-        e = BB->end();
-        continue;
+        Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+                                            P->getIncomingBlock(I), R, TTI);
       }
       continue;
     }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -1632,38 +1632,82 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; CHECK:       bb:
-; CHECK-NEXT:    [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4
-; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1
-; CHECK-NEXT:    [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]]
+; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
 ; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
-; CHECK-NEXT:    [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]]
 ; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
-; CHECK-NEXT:    [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ]
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_1]]
 ;
 ; STORE-LABEL: @reduction_result_used_in_phi(
 ; STORE-NEXT:  entry:
 ; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
 ; STORE:       bb:
-; STORE-NEXT:    [[L_0:%.*]] = load i32, i32* [[DATA:%.*]], align 4
-; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 1
-; STORE-NEXT:    [[L_1:%.*]] = load i32, i32* [[IDX_1]], align 4
-; STORE-NEXT:    [[ADD_1:%.*]] = add i32 [[L_1]], [[L_0]]
+; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
 ; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
-; STORE-NEXT:    [[L_2:%.*]] = load i32, i32* [[IDX_2]], align 4
-; STORE-NEXT:    [[ADD_2:%.*]] = add i32 [[L_2]], [[ADD_1]]
 ; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
-; STORE-NEXT:    [[L_3:%.*]] = load i32, i32* [[IDX_3]], align 4
-; STORE-NEXT:    [[ADD_3:%.*]] = add i32 [[L_3]], [[ADD_2]]
+; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
 ; STORE-NEXT:    br label [[EXIT]]
 ; STORE:       exit:
-; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_3]], [[BB]] ]
+; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; STORE-NEXT:    ret i32 [[SUM_1]]
+;
+entry:
+  br i1 %b, label %bb, label %exit
+
+bb:
+  %l.0 = load i32, i32* %data, align 4
+  %idx.1 = getelementptr inbounds i32, i32* %data, i64 1
+  %l.1 = load i32, i32* %idx.1, align 4
+  %add.1 = add i32 %l.1, %l.0
+  %idx.2 = getelementptr inbounds i32, i32* %data, i64 2
+  %l.2 = load i32, i32* %idx.2, align 4
+  %add.2 = add i32 %l.2, %add.1
+  %idx.3 = getelementptr inbounds i32, i32* %data, i64 3
+  %l.3 = load i32, i32* %idx.3, align 4
+  %add.3 = add i32 %l.3, %add.2
+  br label %exit
+
+exit:
+  %sum.1 = phi i32 [ 0, %entry ], [ %add.3, %bb]
+  ret i32 %sum.1
+}
+
+define i32 @reduction_result_used_in_phi_loop(i32* nocapture readonly %data, i1 zeroext %b) {
+; CHECK-LABEL: @reduction_result_used_in_phi_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
+; CHECK-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
+; CHECK-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
+; CHECK-NEXT:    ret i32 [[SUM_1]]
+;
+; STORE-LABEL: @reduction_result_used_in_phi_loop(
+; STORE-NEXT:  entry:
+; STORE-NEXT:    br i1 [[B:%.*]], label [[BB:%.*]], label [[EXIT:%.*]]
+; STORE:       bb:
+; STORE-NEXT:    [[IDX_1:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i64 1
+; STORE-NEXT:    [[IDX_2:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 2
+; STORE-NEXT:    [[IDX_3:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i64 3
+; STORE-NEXT:    [[TMP0:%.*]] = bitcast i32* [[DATA]] to <4 x i32>*
+; STORE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; STORE-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; STORE-NEXT:    br label [[EXIT]]
+; STORE:       exit:
+; STORE-NEXT:    [[SUM_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[BB]] ]
 ; STORE-NEXT:    ret i32 [[SUM_1]]
 ;
 entry: