Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4262,7 +4262,7 @@
     Builder.setFastMathFlags(Unsafe);
     unsigned i = 0;
 
-    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+    while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
       V.buildTree(VL, ReductionOps);
       if (V.shouldReorder()) {
@@ -4270,7 +4270,7 @@
         V.buildTree(Reversed, ReductionOps);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
-        continue;
+        break;
 
       V.computeMinimumValueSizes();
 
@@ -4296,6 +4296,8 @@
                                              ReducedSubTree, "bin.rdx");
       } else
        VectorizedTree = ReducedSubTree;
+      i += ReduxWidth;
+      ReduxWidth = PowerOf2Floor(NumReducedVals - i);
     }
 
     if (VectorizedTree) {
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -283,52 +283,38 @@
 define float @f(float* nocapture readonly %x) {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
 ; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
-; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
 ; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
-; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
 ; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
-; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
 ; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
-; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
 ; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
-; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
 ; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
-; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
 ; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
-; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
-; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
 ; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
 ; CHECK-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
 ; CHECK-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
@@ -361,8 +347,8 @@
 ; CHECK-NEXT:    [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45
 ; CHECK-NEXT:    [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46
 ; CHECK-NEXT:    [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
-; CHECK-NEXT:    [[TMP17:%.*]] = load <32 x float>, <32 x float>* [[TMP16]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
 ; CHECK-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
 ; CHECK-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
 ; CHECK-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
@@ -394,8 +380,8 @@
 ; CHECK-NEXT:    [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]]
 ; CHECK-NEXT:    [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]]
 ; CHECK-NEXT:    [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]]
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP17]], <32 x float> undef, <32 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP17]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32>
@@ -404,25 +390,19 @@
 ; CHECK-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32>
 ; CHECK-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast float [[TMP18]], [[TMP15]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP14]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fadd fast float [[TMP20]], [[TMP13]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd fast float [[TMP21]], [[TMP12]]
-; CHECK-NEXT:    [[TMP23:%.*]] = fadd fast float [[TMP22]], [[TMP11]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd fast float [[TMP23]], [[TMP10]]
-; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast float [[TMP24]], [[TMP9]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fadd fast float [[TMP25]], [[TMP8]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast float [[TMP26]], [[TMP7]]
-; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast float [[TMP27]], [[TMP6]]
-; CHECK-NEXT:    [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP5]]
-; CHECK-NEXT:    [[TMP30:%.*]] = fadd fast float [[TMP29]], [[TMP4]]
-; CHECK-NEXT:    [[TMP31:%.*]] = fadd fast float [[TMP30]], [[TMP3]]
-; CHECK-NEXT:    [[TMP32:%.*]] = fadd fast float [[TMP31]], [[TMP2]]
-; CHECK-NEXT:    [[TMP33:%.*]] = fadd fast float [[TMP32]], [[TMP1]]
-; CHECK-NEXT:    [[TMP34:%.*]] = fadd fast float [[TMP33]], [[TMP0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; CHECK-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32>
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
+; CHECK-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32>
+; CHECK-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; CHECK-NEXT:    [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32>
+; CHECK-NEXT:    [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
+; CHECK-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32>
+; CHECK-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
+; CHECK-NEXT:    [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
-; CHECK-NEXT:    ret float [[TMP34]]
+; CHECK-NEXT:    ret float [[BIN_RDX17]]
 ;
 entry:
   %0 = load float, float* %x, align 4
@@ -782,41 +762,33 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
 ; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
 ; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
-; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
 ; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
-; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
 ; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
-; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
 ; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
-; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
 ; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
-; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
 ; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
-; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
 ; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
 ; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
@@ -833,8 +805,8 @@
 ; CHECK-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
 ; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
 ; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
-; CHECK-NEXT:    [[TMP15:%.*]] = load <16 x float>, <16 x float>* [[TMP14]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; CHECK-NEXT:    [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
 ; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
 ; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
 ; CHECK-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
@@ -850,31 +822,33 @@
 ; CHECK-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
 ; CHECK-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
 ; CHECK-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> undef, <16 x i32>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP15]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32>
 ; CHECK-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd fast float [[TMP16]], [[TMP13]]
-; CHECK-NEXT:    [[TMP18:%.*]] = fadd fast float [[TMP17]], [[TMP12]]
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd fast float [[TMP18]], [[TMP11]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd fast float [[TMP19]], [[TMP10]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fadd fast float [[TMP20]], [[TMP9]]
-; CHECK-NEXT:    [[TMP22:%.*]] = fadd fast float [[TMP21]], [[TMP8]]
-; CHECK-NEXT:    [[TMP23:%.*]] = fadd fast float [[TMP22]], [[TMP7]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd fast float [[TMP23]], [[TMP6]]
-; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast float [[TMP24]], [[TMP5]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fadd fast float [[TMP25]], [[TMP4]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast float [[TMP26]], [[TMP3]]
-; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast float [[TMP27]], [[TMP2]]
-; CHECK-NEXT:    [[TMP29:%.*]] = fadd fast float [[TMP28]], [[TMP1]]
-; CHECK-NEXT:    [[TMP30:%.*]] = fadd fast float [[TMP29]], [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32>
+; CHECK-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32>
+; CHECK-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]]
+; CHECK-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32>
+; CHECK-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
+; CHECK-NEXT:    [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32>
+; CHECK-NEXT:    [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]]
+; CHECK-NEXT:    [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32>
+; CHECK-NEXT:    [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0
+; CHECK-NEXT:    [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
 ; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
-; CHECK-NEXT:    ret float [[TMP30]]
+; CHECK-NEXT:    ret float [[TMP12]]
 ;
 entry:
   %arrayidx = getelementptr inbounds float, float* %x, i64 1