diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4083,7 +4083,7 @@ SmallVector<OrdersType, 1> ExternalUserReorderIndices = findExternalStoreUsersReorderIndices(TE.get()); if (!ExternalUserReorderIndices.empty()) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); ExternalUserReorderMap.try_emplace(TE.get(), std::move(ExternalUserReorderIndices)); } @@ -4103,7 +4103,7 @@ OpcodeMask.set(Lane); // If this pattern is supported by the target then we consider the order. if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { - VFToOrderedEntries[TE->Scalars.size()].insert(TE.get()); + VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get()); AltShufflesToOrders.try_emplace(TE.get(), OrdersType()); } // TODO: Check the reverse order too. @@ -4141,7 +4141,7 @@ }); // Reorder the graph nodes according to their vectorization factor. 
- for (unsigned VF = VectorizableTree.front()->Scalars.size(); VF > 1; + for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1; VF /= 2) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-vf-to-resize.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -mcpu=skx -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s + +define void @main(ptr %0) { +; CHECK-LABEL: @main( +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[TMP0:%.*]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <4 x i32> <i32 2, i32 1, i32 2, i32 1> +; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x double> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = fcmp oeq <4 x double> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], double 0.000000e+00, double 0.000000e+00 +; CHECK-NEXT: store double [[TMP10]], ptr null, align 8 +; CHECK-NEXT: ret void +; + %.unpack = load double, ptr %0, align 8 + %.elt1 = getelementptr { double, double }, ptr %0, i64 0, i32 1 + %.unpack2 = load double, ptr %.elt1, align 8 + %2 = fadd double %.unpack, 0.000000e+00 + %3 = fsub double 0.000000e+00, %.unpack2 + %4 = fmul double %2, 0.000000e+00 + %5 = call double @llvm.fabs.f64(double %4) + %6 = fmul double %3, 0.000000e+00 + %7 = call double @llvm.fabs.f64(double %6) + %8 = fmul double %3, 0.000000e+00 + %9 = call 
double @llvm.fabs.f64(double %8) + %10 = fmul double %2, 0.000000e+00 + %11 = call double @llvm.fabs.f64(double %10) + %12 = fcmp oeq double %5, 0.000000e+00 + %13 = fcmp oeq double %7, 0.000000e+00 + %14 = or i1 %12, %13 + %15 = fcmp oeq double %11, 0.000000e+00 + %16 = or i1 %14, %15 + %17 = fcmp oeq double %9, 0.000000e+00 + %18 = or i1 %16, %17 + %19 = select i1 %18, double 0.000000e+00, double 0.000000e+00 + store double %19, ptr null, align 8 + ret void +} + +declare double @llvm.fabs.f64(double)