diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2575,6 +2575,7 @@ // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || + UseEntry->State == TreeEntry::ScatterVectorize || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -243,20 +243,19 @@ ; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> , <2 x double> undef) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double*> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B2]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[IDXB0]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[B2]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], [[TMP10]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; @@ -320,34 +319,36 @@ ; CHECK-LABEL: @lookahead_limit_users_budget( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 -; CHECK-NEXT: [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0 ; CHECK-NEXT: [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0 ; CHECK-NEXT: [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 -; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double*> poison, double* [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[B]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 -; CHECK-NEXT: [[A0:%.*]] = load double, double* [[IDXA0]], align 8 -; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDXA1]], align 8 -; CHECK-NEXT: [[B2:%.*]] = load double, double* [[IDXB2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> , <2 x double> undef) ; CHECK-NEXT: [[A2:%.*]] = load double, double* [[IDXA2]], align 8 ; CHECK-NEXT: [[B1:%.*]] = load double, double* [[IDXB1]], align 8 -; CHECK-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]] -; CHECK-NEXT: [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]] -; CHECK-NEXT: [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]] -; CHECK-NEXT: [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]] -; CHECK-NEXT: [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]] -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A2]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP11]] ; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 ; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 -; CHECK-NEXT: store double [[ADD0]], double* [[IDXS0]], align 8 -; CHECK-NEXT: store double [[ADD1]], double* [[IDXS1]], align 8 -; CHECK-NEXT: store double [[A1]], double* [[EXT1:%.*]], align 8 -; CHECK-NEXT: store double [[A1]], double* [[EXT2:%.*]], align 8 -; CHECK-NEXT: store double [[A1]], double* [[EXT3:%.*]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT1:%.*]], align 8 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT2:%.*]], align 8 +; CHECK-NEXT: store double [[TMP14]], double* [[EXT3:%.*]], align 8 ; CHECK-NEXT: store double [[B1]], double* [[EXT4:%.*]], align 8 ; CHECK-NEXT: store double [[B1]], double* [[EXT5:%.*]], align 8 ; CHECK-NEXT: ret void