Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2457,7 +2457,31 @@ unsigned BundleWidth = VectorizableTree[0].Scalars.size(); - for (TreeEntry &TE : VectorizableTree) { + for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { + TreeEntry &TE = VectorizableTree[I]; + + // We create duplicate tree entries for gather sequences that have multiple + // uses. However, we should only compute the cost of duplicate sequences + // once. For example, if we have a build vector (i.e., insertelement + // sequence) that is used by more than one vector instruction, we only need + // to compute the cost of the insertelement instructions once. The + // redundant instructions will be eliminated by CSE. + // + // We should consider not creating duplicate tree entries for gather + // sequences, and instead add additional edges to the tree representing + // their uses. Since such an approach results in fewer total entries, + // existing heuristics based on tree size may yield different results. 
+ // + if (TE.NeedToGather) { + auto It = + std::find_if(std::next(VectorizableTree.begin(), I + 1), + VectorizableTree.end(), [TE](TreeEntry &Entry) { + return Entry.NeedToGather && Entry.isSame(TE.Scalars); + }); + if (It != VectorizableTree.end()) + continue; + } + int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] << ".\n"); Index: test/Transforms/SLPVectorizer/AArch64/gather-cost.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=REMARK %s +; RUN: opt < %s -S -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s +; RUN: cat %t | FileCheck -check-prefix=REMARK %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; CHECK-LABEL: @gather_multiple_use( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 %b, i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 %d, i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: ret i32 [[TMP10]] +; +; REMARK-LABEL: Function: gather_multiple_use +; REMARK: Args: +; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' +; REMARK-NEXT: - Cost: '-7' +; +define internal i32 
@gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) { + %tmp00 = lshr i32 %a, 15 + %tmp01 = and i32 %tmp00, 65537 + %tmp02 = mul nuw i32 %tmp01, 65535 + %tmp03 = add i32 %tmp02, %a + %tmp04 = xor i32 %tmp03, %tmp02 + %tmp05 = lshr i32 %c, 15 + %tmp06 = and i32 %tmp05, 65537 + %tmp07 = mul nuw i32 %tmp06, 65535 + %tmp08 = add i32 %tmp07, %c + %tmp09 = xor i32 %tmp08, %tmp07 + %tmp10 = lshr i32 %b, 15 + %tmp11 = and i32 %tmp10, 65537 + %tmp12 = mul nuw i32 %tmp11, 65535 + %tmp13 = add i32 %tmp12, %b + %tmp14 = xor i32 %tmp13, %tmp12 + %tmp15 = lshr i32 %d, 15 + %tmp16 = and i32 %tmp15, 65537 + %tmp17 = mul nuw i32 %tmp16, 65535 + %tmp18 = add i32 %tmp17, %d + %tmp19 = xor i32 %tmp18, %tmp17 + %tmp20 = add i32 %tmp09, %tmp04 + %tmp21 = add i32 %tmp20, %tmp14 + %tmp22 = add i32 %tmp21, %tmp19 + ret i32 %tmp22 +} Index: test/Transforms/SLPVectorizer/X86/blending-shuffle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -141,15 +141,16 @@ ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1 +; CHECK-NEXT: [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2 ; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] ; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]] -; CHECK-NEXT: ret i8 [[TMP6]] +; CHECK-NEXT: [[X1X1:%.*]] = mul i8 [[X1]], [[X1]] +; CHECK-NEXT: [[X2X2:%.*]] = mul i8 [[X2]], [[X2]] +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]] +; 
CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]] +; CHECK-NEXT: [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i8 [[TMP3]] ; %x0 = extractelement <4 x i8> %x, i32 0 br label %bb1