Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6669,11 +6669,18 @@ // to a nearby power-of-2. Can safely generate oversized // vectors and rely on the backend to split them to legal sizes. unsigned NumReducedVals = ReducedVals.size(); - if (NumReducedVals < 4) + if (NumReducedVals < 2) return false; - unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + // Allow 2-way reductions only for comparisons (bool type). Ideally, we + // would allow this for any type, but it may interfere with other + // vectorization attempts. + if (NumReducedVals < 4 && + ReductionRoot->getType()->getScalarSizeInBits() != 1) + return false; + unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); + unsigned MinRdxWidth = Log2_32(ReduxWidth); Value *VectorizedTree = nullptr; // FIXME: Fast-math-flags should be set based on the instructions in the @@ -6709,7 +6716,7 @@ SmallVector IgnoreList; for (auto &V : ReductionOps) IgnoreList.append(V.begin(), V.end()); - while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { + while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > MinRdxWidth) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); V.buildTree(VL, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); Index: llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll +++ llvm/test/Transforms/SLPVectorizer/X86/fabs-cost-softfp.ll @@ -15,10 +15,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x fp128> [[TMP0]], fp128 [[D:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = call <2 x fp128> @llvm.fabs.v2f128(<2 x fp128> [[TMP1]]) ; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x fp128> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 -; CHECK-NEXT: [[OR_COND39:%.*]] = or i1 [[TMP4]], [[TMP5]] -; CHECK-NEXT: br i1 [[OR_COND39]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[IF_THEN13:%.*]], label [[IF_END24:%.*]] ; CHECK: if.then13: ; CHECK-NEXT: unreachable ; CHECK: if.end24: Index: llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -859,11 +859,13 @@ ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4 ; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4 ; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4 ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -872,8 +874,8 @@ ; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 @@ -890,34 +892,32 @@ ; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> undef, <16 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP9]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 -; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP7]], [[RDX_SHUF7]] ; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] -; CHECK-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 -; CHECK-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; CHECK-NEXT: ret float [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[OP_RDX]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP14]], [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP0]] +; CHECK-NEXT: ret float [[TMP17]] ; ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: @@ -926,11 +926,13 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2 ; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4 +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6 -; THRESHOLD-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>* -; THRESHOLD-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 +; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7 ; THRESHOLD-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8 ; THRESHOLD-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9 @@ -939,8 +941,8 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12 ; THRESHOLD-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13 ; THRESHOLD-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14 -; THRESHOLD-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* -; THRESHOLD-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4 +; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>* +; THRESHOLD-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15 ; THRESHOLD-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16 ; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17 @@ -957,34 +959,32 @@ ; THRESHOLD-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30 -; THRESHOLD-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4 -; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> -; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>* +; THRESHOLD-NEXT: [[TMP9:%.*]] = load <16 x float>, <16 x float>* [[TMP8]], align 4 +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP9]], <16 x float> undef, <16 x i32> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP9]], [[RDX_SHUF]] ; THRESHOLD-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]] ; THRESHOLD-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]] ; THRESHOLD-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> ; THRESHOLD-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 -; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> -; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0 +; THRESHOLD-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> +; THRESHOLD-NEXT: [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP7]], [[RDX_SHUF7]] ; THRESHOLD-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]] ; THRESHOLD-NEXT: [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> ; THRESHOLD-NEXT: [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP8]], [[TMP9]] -; THRESHOLD-NEXT: [[RDX_SHUF13:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> -; THRESHOLD-NEXT: [[BIN_RDX14:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF13]] -; THRESHOLD-NEXT: [[RDX_SHUF15:%.*]] = shufflevector <4 x float> [[BIN_RDX14]], <4 x float> undef, <4 x i32> -; THRESHOLD-NEXT: [[BIN_RDX16:%.*]] = fadd fast <4 x float> [[BIN_RDX14]], [[RDX_SHUF15]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX16]], i32 0 -; THRESHOLD-NEXT: [[OP_RDX17:%.*]] = fadd fast float [[OP_RDX]], [[TMP10]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = fadd fast float [[OP_RDX17]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]] -; THRESHOLD-NEXT: ret float [[TMP12]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0 +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP10]], [[TMP11]] +; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[OP_RDX]], [[TMP5]] +; THRESHOLD-NEXT: [[TMP13:%.*]] = fadd fast float [[TMP12]], [[TMP4]] +; THRESHOLD-NEXT: [[TMP14:%.*]] = fadd fast float [[TMP13]], [[TMP3]] +; THRESHOLD-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP14]], [[TMP2]] +; THRESHOLD-NEXT: [[TMP16:%.*]] = fadd fast float [[TMP15]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP0]] +; THRESHOLD-NEXT: ret float [[TMP17]] ; entry: %arrayidx = getelementptr inbounds float, float* %x, i64 1 Index: llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -54,10 +54,10 @@ define i1 @two_wide_fcmp_reduction(<2 x double> %a0) { ; CHECK-LABEL: @two_wide_fcmp_reduction( ; CHECK-NEXT: [[A:%.*]] = fcmp ogt <2 x double> [[A0:%.*]], -; CHECK-NEXT: [[B:%.*]] = extractelement <2 x i1> [[A]], i32 0 -; CHECK-NEXT: [[C:%.*]] = extractelement <2 x i1> [[A]], i32 1 -; CHECK-NEXT: [[D:%.*]] = and i1 [[B]], [[C]] -; CHECK-NEXT: ret i1 [[D]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[A]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i1> [[A]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i1 [[TMP1]] ; %a = fcmp ogt <2 x double> %a0, %b = extractelement <2 x i1> %a, i32 0 @@ -96,18 +96,17 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> undef, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP8]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[CMP4:%.*]] = fcmp olt double [[TMP9]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP4]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <2 x i1> [[TMP8]], [[RDX_SHUF1]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BIN_RDX2]], i32 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: ; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: [[NOT_OR_COND9:%.*]] = or i1 [[TMP11]], [[TMP12]] -; CHECK-NEXT: ret i1 [[NOT_OR_COND9]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP10]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP10]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i1 [[TMP11]] ; CHECK: cleanup: ; CHECK-NEXT: ret i1 false ; @@ -146,10 +145,10 @@ ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[MUL]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = fcmp uge <2 x double> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 -; CHECK-NEXT: [[NOT_OR_COND:%.*]] = or i1 [[TMP10]], [[TMP11]] -; CHECK-NEXT: ret i1 [[NOT_OR_COND]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i1> [[TMP9]], <2 x i1> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP9]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BIN_RDX]], i32 0 +; CHECK-NEXT: ret i1 [[TMP10]] ; %fneg = fneg double %b %add = fsub double %c, %b