Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -139,6 +139,10 @@ "slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits")); +static cl::opt AllowNonPowerOf2Elts( + "slp-allow-non-power-of-2", cl::init(true), cl::Hidden, + cl::desc("Allow non-power-of-2 elements per vector")); + static cl::opt RecursionMaxDepth( "slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree")); @@ -2180,13 +2184,18 @@ if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndicies.clear(); } else { - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + if (NumUniqueScalarValues <= 1) { + LLVM_DEBUG(dbgs() << "SLP: Less than 2 scalars used in bundle.\n"); + newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx); + return; + } + if (!isPowerOf2_32(NumUniqueScalarValues) && !AllowNonPowerOf2Elts) { + LLVM_DEBUG(dbgs() << "SLP: Non-power-of-2 elements in bundle.\n"); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx); return; } + // Vectorization requires shuffling to duplicate scalar values. + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); VL = UniqueValues; } Index: llvm/test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,21 +18,15 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <3 x i32> ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <3 x double> [[SHUFFLE]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <3 x double> [[TMP2]], <3 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: Index: llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,15 +54,16 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <3 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, <3 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0