Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6456,7 +6456,7 @@
     // to a nearby power-of-2. Can safely generate oversized
     // vectors and rely on the backend to split them to legal sizes.
     unsigned NumReducedVals = ReducedVals.size();
-    if (NumReducedVals < 4)
+    if (NumReducedVals < 2)
       return false;
 
     unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
@@ -6484,7 +6484,7 @@
     SmallVector<Value *, 16> IgnoreList;
     for (auto &V : ReductionOps)
       IgnoreList.append(V.begin(), V.end());
-    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 1) {
      auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
      V.buildTree(VL, ExternallyUsedValues, IgnoreList);
      Optional<ArrayRef<unsigned>> Order = V.bestOrder();
Index: llvm/test/Feature/weak_constant.ll
===================================================================
--- llvm/test/Feature/weak_constant.ll
+++ llvm/test/Feature/weak_constant.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -O3 -S > %t
-; RUN: grep undef %t | count 1
+; RUN: grep undef %t | count 2
 ; RUN: grep 5 %t | count 1
 ; RUN: grep 7 %t | count 1
 ; RUN: grep 9 %t | count 1
Index: llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -19,10 +19,10 @@
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP6]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP7]], 0.000000e+00
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK: for.end27:
 ; CHECK-NEXT: ret void
@@ -64,10 +64,10 @@
 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP6]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0
+; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[TMP7]], 0.000000e+00
 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK: for.end27:
 ; CHECK-NEXT: ret void
Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
=================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -209,10 +209,8 @@ ; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 ; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 ; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 -; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 -; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 -; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 -; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5) to <2 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> [[TMP2]], zeroinitializer ; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 @@ -220,19 +218,20 @@ ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> -; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> undef, i1 [[TMP4]], i32 0 +; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMP6]], i32 1 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP7]], i1 [[P5]], i32 2 +; MAX-COST-NEXT: [[TMP9:%.*]] = insertelement <4 x i1> [[TMP8]], i1 [[P7]], i32 3 +; MAX-COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> , <2 x i32> +; MAX-COST-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) +; MAX-COST-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <2 x i32> +; MAX-COST-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP11]], [[RDX_SHUF]] +; MAX-COST-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[BIN_RDX]], i32 0 +; MAX-COST-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP12]], [[TMP13]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = 
add i32 [[OP_RDX]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 Index: llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll +++ llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll @@ -18,21 +18,29 @@ ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i32 [[TMP5]], i32 [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <2 x i32>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP4]], <2 x i32> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x i32> [[TMP7]], <2 x i32> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), 
align 4 +; GFX9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[OP_EXTRA]], i32 [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA]] +; GFX9-NEXT: ret i32 [[SELECT5]] ; %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -65,21 +73,29 @@ ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([32 x i64]* @arr64 to <2 x i64>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], i64 [[TMP5]], i64 [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <2 x i64>*), align 16 +; GFX9-NEXT: [[LOAD5:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[TMP4]], <2 x i64> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = icmp slt <2 x i64> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x i64> [[TMP7]], <2 x i64> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = icmp slt i64 [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 
[[TMP14]], i64 [[TMP13]], i64 [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 5), align 8 +; GFX9-NEXT: [[CMP5:%.*]] = icmp slt i64 [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i64 [[OP_EXTRA]], i64 [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], i64 3, i64 4 ; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8 -; GFX9-NEXT: ret i64 [[OP_EXTRA]] +; GFX9-NEXT: ret i64 [[SELECT5]] ; %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16 %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8 @@ -112,21 +128,29 @@ ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([32 x float]* @farr to <2 x float>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast ogt float [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], float [[TMP2]], float [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <4 x float>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP4]], <4 x float> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast ogt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast ogt float [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], float [[TMP5]], float [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], float 3.000000e+00, float 4.000000e+00 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 2) to <2 x float>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x float> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x float> [[TMP4]], <2 x float> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = fcmp fast ogt <2 x float> [[TMP7]], [[TMP9]] +; GFX9-NEXT: 
[[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = fcmp fast ogt float [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], float [[TMP13]], float [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast ogt float [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], float [[OP_EXTRA]], float [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], float 3.000000e+00, float 4.000000e+00 ; GFX9-NEXT: store float [[STORE_SELECT]], float* @fvar, align 8 -; GFX9-NEXT: ret float [[OP_EXTRA]] +; GFX9-NEXT: ret float [[SELECT5]] ; %load1 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 0), align 16 %load2 = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @farr, i64 0, i64 1), align 4 @@ -159,21 +183,29 @@ ; GFX9-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([32 x double]* @darr to <2 x double>*), align 16 ; GFX9-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 ; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; GFX9-NEXT: [[CMP1:%.*]] = fcmp fast olt double [[TMP2]], [[TMP3]] -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], double [[TMP2]], double [[TMP3]] -; GFX9-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <4 x double>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x double> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x double> [[TMP4]], <4 x double> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x double> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x double> [[RDX_MINMAX_SELECT]], <4 x double> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP6:%.*]] = fcmp fast olt double [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP6]], double [[TMP5]], double [[SELECT1]] -; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], double 3.000000e+00, double 4.000000e+00 +; GFX9-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 2) to <2 x double>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 4), align 16 +; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[TMP4]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[TMP4]], <2 x double> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP5:%.*]] = extractelement <2 x double> 
[[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP2]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = insertelement <2 x double> undef, double [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TMP3]], i32 1 +; GFX9-NEXT: [[TMP10:%.*]] = fcmp fast olt <2 x double> [[TMP7]], [[TMP9]] +; GFX9-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP10]], <2 x double> [[TMP7]], <2 x double> [[TMP9]] +; GFX9-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; GFX9-NEXT: [[TMP14:%.*]] = fcmp fast olt double [[TMP13]], [[TMP12]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], double [[TMP13]], double [[TMP12]] +; GFX9-NEXT: [[LOAD6:%.*]] = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = fcmp fast olt double [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], double [[OP_EXTRA]], double [[LOAD6]] +; GFX9-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[TMP15]], double 3.000000e+00, double 4.000000e+00 ; GFX9-NEXT: store double [[STORE_SELECT]], double* @dvar, align 8 -; GFX9-NEXT: ret double [[OP_EXTRA]] +; GFX9-NEXT: ret double [[SELECT5]] ; %load1 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 0), align 16 %load2 = load double, double* getelementptr inbounds ([32 x double], [32 x double]* @darr, i64 0, i64 1), align 4 @@ -205,22 +237,29 @@ ; GFX9-LABEL: @smax_wdiff_valuenum( ; GFX9-NEXT: [[VLOAD:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 ; GFX9-NEXT: [[ELT1:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0 -; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[ELT1]], [[V1:%.*]] -; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0 -; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]] -; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; GFX9-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP4]], i32 [[TMP3]], i32 [[SELECT1]] -; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4 +; GFX9-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <2 x i32>*), align 8 +; GFX9-NEXT: [[LOAD5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 4), align 16 +; GFX9-NEXT: 
[[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF]] +; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF]] +; GFX9-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 +; GFX9-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[TMP3]], i32 0 +; GFX9-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[ELT1]], i32 1 +; GFX9-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[LOAD5]], i32 0 +; GFX9-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[V1:%.*]], i32 1 +; GFX9-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[TMP5]], [[TMP7]] +; GFX9-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x i32> [[TMP5]], <2 x i32> [[TMP7]] +; GFX9-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 +; GFX9-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0 +; GFX9-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP10]] +; GFX9-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP10]] +; GFX9-NEXT: [[LOAD6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 5), align 4 +; GFX9-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[LOAD6]] +; GFX9-NEXT: [[SELECT5:%.*]] = select i1 [[CMP5]], i32 [[OP_EXTRA]], i32 [[LOAD6]] +; GFX9-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[TMP13]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA]] +; GFX9-NEXT: ret i32 [[SELECT5]] ; %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 %elt1 = extractelement <2 x i32> %vload, i32 0 Index: llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll +++ llvm/test/Transforms/SLPVectorizer/X86/PR34635.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux -slp-vectorizer -S -mcpu=corei7 | FileCheck %s define i32 @main() { -; CHECK-LABEL: define {{[^@]+}}@main( +; CHECK-LABEL: @main( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[T:%.*]] = alloca <8 x i32>, align 32 ; CHECK-NEXT: [[T1:%.*]] = bitcast <8 x i32>* [[T]] to [8 x i32]* @@ -18,15 +18,17 @@ ; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* [[T1]], i64 0, i64 7 ; CHECK-NEXT: store <8 x i32> , <8 x i32>* [[T]], align 32 ; CHECK-NEXT: [[T12:%.*]] = bitcast i32* [[T2]] to i8* -; CHECK-NEXT: [[T13:%.*]] = load i32, i32* [[T4]], align 32 -; CHECK-NEXT: [[T14:%.*]] = load i32, i32* [[T5]], align 4 -; CHECK-NEXT: [[T15:%.*]] = icmp slt i32 [[T14]], [[T13]] -; CHECK-NEXT: [[T16:%.*]] = select i1 [[T15]], i32 [[T14]], i32 [[T13]] -; CHECK-NEXT: [[T17:%.*]] = zext i1 [[T15]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[T4]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 32 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i32> [[TMP1]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i32> [[TMP1]], <2 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT]], i32 0 +; CHECK-NEXT: [[T17:%.*]] = zext i1 undef to i32 ; CHECK-NEXT: [[T18:%.*]] = load i32, i32* [[T6]], align 8 -; 
CHECK-NEXT: [[T19:%.*]] = icmp slt i32 [[T18]], [[T16]] -; CHECK-NEXT: [[T20:%.*]] = select i1 [[T19]], i32 [[T18]], i32 [[T16]] -; CHECK-NEXT: [[T21:%.*]] = select i1 [[T19]], i32 2, i32 [[T16]] +; CHECK-NEXT: [[T19:%.*]] = icmp slt i32 [[T18]], [[TMP2]] +; CHECK-NEXT: [[T20:%.*]] = select i1 [[T19]], i32 [[T18]], i32 [[TMP2]] +; CHECK-NEXT: [[T21:%.*]] = select i1 [[T19]], i32 2, i32 [[TMP2]] ; CHECK-NEXT: [[T22:%.*]] = load i32, i32* [[T7]], align 4 ; CHECK-NEXT: [[T23:%.*]] = icmp slt i32 [[T22]], [[T20]] ; CHECK-NEXT: [[T24:%.*]] = select i1 [[T23]], i32 [[T22]], i32 [[T20]] Index: llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll @@ -15,10 +15,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x fp128> [[TMP0]], fp128 [[D:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x fp128> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: [[OR_COND39:%.*]] = or i1 [[TMP4]], [[TMP5]] -; CHECK-NEXT: br i1 [[OR_COND39]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] ; CHECK: if.then13: ; CHECK-NEXT: unreachable ; CHECK: if.end24: Index: llvm/test/Transforms/SLPVectorizer/X86/hadd.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -78,34 +78,13 @@ } define <2 x i64> @test_v2i64(<2 x i64> %a, <2 x i64> %b) { -; SSE-LABEL: @test_v2i64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; SSE-NEXT: ret <2 x i64> [[TMP3]] -; -; SLM-LABEL: @test_v2i64( -; SLM-NEXT: [[A0:%.*]] = extractelement <2 x i64> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <2 x i64> [[A]], i32 1 -; SLM-NEXT: [[B0:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <2 x i64> [[B]], i32 1 -; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[A1]] -; SLM-NEXT: [[R1:%.*]] = add i64 [[B0]], [[B1]] -; SLM-NEXT: [[R00:%.*]] = insertelement <2 x i64> undef, i64 [[R0]], i32 0 -; SLM-NEXT: [[R01:%.*]] = insertelement <2 x i64> [[R00]], i64 [[R1]], i32 1 -; SLM-NEXT: ret <2 x i64> [[R01]] -; -; AVX-LABEL: @test_v2i64( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; AVX-NEXT: ret <2 x i64> [[TMP3]] -; -; AVX512-LABEL: @test_v2i64( -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[B:%.*]], <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; 
AVX512-NEXT: ret <2 x i64> [[TMP3]] +; CHECK-LABEL: @test_v2i64( +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = add <2 x i64> [[RDX_SHUF1]], [[A]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[RDX_SHUF]], [[B]] +; CHECK-NEXT: [[R01:%.*]] = shufflevector <2 x i64> [[BIN_RDX2]], <2 x i64> [[BIN_RDX]], <2 x i32> +; CHECK-NEXT: ret <2 x i64> [[R01]] ; %a0 = extractelement <2 x i64> %a, i32 0 %a1 = extractelement <2 x i64> %a, i32 1 Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -40,25 +40,28 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] -; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] +; THRESHOLD-NEXT: [[ADD_1:%.*]] = 
fadd fast float [[MUL4_1]], [[ADD]] +; THRESHOLD-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP7]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; THRESHOLD-NEXT: [[TMP10:%.*]] = fadd fast float [[TMP9]], [[TMP8]] +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP10]], [[MUL4_1]] +; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[MUL4]] +; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], [[ADD_1]] +; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP7]], i32 1 +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP11]], [[OP_EXTRA3]] ; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4 ; THRESHOLD-NEXT: ret float [[ADD19_3]] ; @@ -922,9 +925,9 @@ ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>* +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 @@ -982,9 +985,11 @@ ; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] ; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 ; THRESHOLD-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; THRESHOLD-NEXT: ret float [[TMP12]] +; THRESHOLD-NEXT: [[RDX_SHUF18:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> undef, <2 x i32> +; THRESHOLD-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x float> [[TMP1]], [[RDX_SHUF18]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[BIN_RDX19]], i32 0 +; THRESHOLD-NEXT: [[OP_RDX20:%.*]] = fadd fast float [[OP_RDX17]], [[TMP11]] +; THRESHOLD-NEXT: ret float [[OP_RDX20]] ; entry: %arrayidx = getelementptr inbounds float, float* %x, i64 1 Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -486,54 +486,56 @@ ; SSE-NEXT: ret i32 [[TMP14]] ; ; AVX-LABEL: 
@maxi8_mutiple_uses( -; AVX-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] -; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] +; AVX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 +; AVX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; AVX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF4]] +; AVX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF4]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] ; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]] -; AVX-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] -; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP5]] -; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP12]] -; AVX-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA]], i32 [[TMP12]] -; AVX-NEXT: [[TMP15:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX-NEXT: store i32 [[TMP15]], i32* @var, align 8 -; AVX-NEXT: ret i32 [[TMP14]] +; AVX-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], [[TMP5]] +; AVX-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP5]] +; AVX-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP3]] +; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP3]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* 
getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 [[TMP10]] +; AVX-NEXT: [[TMP13:%.*]] = select i1 undef, i32 3, i32 4 +; AVX-NEXT: store i32 [[TMP13]], i32* @var, align 8 +; AVX-NEXT: ret i32 [[TMP12]] ; ; AVX2-LABEL: @maxi8_mutiple_uses( -; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 -; AVX2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> -; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] -; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] +; AVX2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 +; AVX2-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; AVX2-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP2]], [[RDX_SHUF4]] +; AVX2-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP2]], <2 x i32> [[RDX_SHUF4]] +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 +; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] +; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP7]] -; AVX2-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP7]] -; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP5]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 [[TMP5]] -; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 -; AVX2-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA]], i32 [[TMP12]] -; AVX2-NEXT: [[TMP15:%.*]] = select i1 [[TMP4]], i32 3, i32 4 -; AVX2-NEXT: store i32 [[TMP15]], i32* @var, align 8 -; AVX2-NEXT: ret i32 
[[TMP14]] +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP6]], [[TMP5]] +; AVX2-NEXT: [[TMP8:%.*]] = select i1 [[TMP7]], i32 [[TMP6]], i32 [[TMP5]] +; AVX2-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[TMP8]], [[TMP3]] +; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP9]], i32 [[TMP8]], i32 [[TMP3]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_EXTRA]], [[TMP10]] +; AVX2-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 [[TMP10]] +; AVX2-NEXT: [[TMP13:%.*]] = select i1 undef, i32 3, i32 4 +; AVX2-NEXT: store i32 [[TMP13]], i32* @var, align 8 +; AVX2-NEXT: ret i32 [[TMP12]] ; ; SKX-LABEL: @maxi8_mutiple_uses( ; SKX-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 @@ -627,21 +629,22 @@ ; AVX: pp: ; AVX-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 ; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; AVX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] +; AVX-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; AVX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[RDX_SHUF4]] +; AVX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP7]], <2 x i32> [[RDX_SHUF4]] +; AVX-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; AVX-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] +; AVX-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_RDX]], [[TMP5]] +; AVX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[OP_RDX]], i32 [[TMP5]] ; AVX-NEXT: ret i32 [[OP_EXTRA]] ; ; AVX2-LABEL: @maxi8_wrong_parent( @@ 
-652,21 +655,22 @@ ; AVX2: pp: ; AVX2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i32 [[TMP2]], i32 [[TMP3]] ; AVX2-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; AVX2-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 ; AVX2-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> [[RDX_SHUF]] ; AVX2-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; AVX2-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; AVX2-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; AVX2-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; AVX2-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; AVX2-NEXT: [[TMP13:%.*]] = select i1 [[TMP12]], i32 [[TMP11]], i32 [[TMP8]] -; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP13]], [[TMP5]] -; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP14]], i32 [[TMP13]], i32 [[TMP5]] +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; AVX2-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; AVX2-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[RDX_SHUF4]] +; AVX2-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP7]], <2 x i32> [[RDX_SHUF4]] +; AVX2-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; AVX2-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; AVX2-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 [[TMP9]] +; AVX2-NEXT: [[TMP11:%.*]] = icmp sgt i32 [[OP_RDX]], [[TMP5]] +; AVX2-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[OP_RDX]], i32 [[TMP5]] ; AVX2-NEXT: ret i32 [[OP_EXTRA]] ; ; SKX-LABEL: @maxi8_wrong_parent( @@ -677,29 +681,30 @@ ; SKX-NEXT: br label [[PP:%.*]] ; SKX: pp: ; SKX-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; SKX-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6), align 8 -; SKX-NEXT: [[TMP8:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 7), align 4 +; SKX-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 6) to <2 x i32>*), align 8 ; SKX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP6]], [[RDX_SHUF]] ; SKX-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP6]], <4 x i32> 
[[RDX_SHUF]] ; SKX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; SKX-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; SKX-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; SKX-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP9]], [[TMP7]] -; SKX-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 [[TMP7]] -; SKX-NEXT: [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]] -; SKX-NEXT: [[TMP13:%.*]] = insertelement <2 x i1> undef, i1 [[TMP12]], i32 0 -; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i1> [[TMP13]], i1 [[TMP5]], i32 1 -; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i32 0 -; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP3]], i32 1 -; SKX-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i32 0 -; SKX-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> [[TMP17]], i32 [[TMP4]], i32 1 -; SKX-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP14]], <2 x i32> [[TMP16]], <2 x i32> [[TMP18]] -; SKX-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[TMP19]], i32 1 -; SKX-NEXT: [[TMP21:%.*]] = extractelement <2 x i32> [[TMP19]], i32 0 -; SKX-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP21]], [[TMP20]] -; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP22]], i32 [[TMP21]], i32 [[TMP20]] +; SKX-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; SKX-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <2 x i32> +; SKX-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i32> [[TMP7]], [[RDX_SHUF4]] +; SKX-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i32> [[TMP7]], <2 x i32> [[RDX_SHUF4]] +; SKX-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[RDX_MINMAX_SELECT6]], i32 0 +; SKX-NEXT: [[TMP10:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +; SKX-NEXT: [[TMP11:%.*]] = insertelement <2 x i1> undef, i1 [[TMP10]], i32 0 +; SKX-NEXT: [[TMP12:%.*]] = insertelement <2 x i1> [[TMP11]], i1 [[TMP5]], i32 1 +; SKX-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP8]], i32 0 +; SKX-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP3]], i32 1 +; SKX-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP9]], i32 0 +; SKX-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP4]], i32 1 +; SKX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP14]], <2 x i32> [[TMP16]] +; SKX-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1 +; SKX-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 0 +; SKX-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], [[TMP18]] +; SKX-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP20]], i32 [[TMP19]], i32 [[TMP18]] ; SKX-NEXT: ret i32 [[OP_EXTRA]] ; %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -926,11 +926,11 @@ ; STORE-NEXT: [[TMP3:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* ; STORE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP4]] -; STORE-NEXT: 
[[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; STORE-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
-; STORE-NEXT: [[ADD8:%.*]] = fadd fast double [[TMP6]], [[TMP7]]
+; STORE-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; STORE-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP5]], [[RDX_SHUF]]
+; STORE-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0
 ; STORE-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 [[I_018]]
-; STORE-NEXT: store double [[ADD8]], double* [[ARRAYIDX9]], align 8
+; STORE-NEXT: store double [[TMP6]], double* [[ARRAYIDX9]], align 8
 ; STORE-NEXT: [[INC]] = add nsw i64 [[I_018]], 1
 ; STORE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], [[TMP2]]
 ; STORE-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]]
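
Note (illustrative, not part of the patch): with the thresholds lowered to NumReducedVals >= 2 and ReduxWidth > 1, the SLP vectorizer also rewrites two-element horizontal reductions. A minimal sketch of the shape of that rewrite, adapted from the commute.ll checks above; the value names %v, %e0, %e1, %sum are placeholders, not taken from the tests:

  ; before: scalar reduction of the two lanes of %v
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %sum = fadd fast float %e0, %e1

  ; after: shuffle-based horizontal reduction, result read from lane 0
  %rdx.shuf = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %bin.rdx = fadd fast <2 x float> %v, %rdx.shuf
  %sum = extractelement <2 x float> %bin.rdx, i32 0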