Skip to content

Commit a16cfe6

Browse files
committed Feb 3, 2017
[SLP] Fix for PR31690: Allow using of extra values in horizontal reductions.
Currently LLVM supports vectorization of horizontal reduction instructions with initial value set to 0. Patch supports vectorization of reduction with non-zero initial values. Also it supports a vectorization of instructions with some extra arguments, like: float f(float x[], int a, int b) { float p = a % b; p += x[0] + 3; for (int i = 1; i < 32; i++) p += x[i]; return p; } Patch allows vectorization of this kind of horizontal reductions. Differential Revision: https://reviews.llvm.org/D28961 llvm-svn: 293994
1 parent 1380edf commit a16cfe6

File tree

2 files changed

+181
-146
lines changed

2 files changed

+181
-146
lines changed
 

‎llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+67-12
Original file line numberDiff line numberDiff line change
@@ -4186,6 +4186,8 @@ namespace {
41864186
class HorizontalReduction {
41874187
SmallVector<Value *, 16> ReductionOps;
41884188
SmallVector<Value *, 32> ReducedVals;
4189+
// Use map vector to make stable output.
4190+
MapVector<Value *, Value *> ExtraArgs;
41894191

41904192
BinaryOperator *ReductionRoot = nullptr;
41914193
// After successfull horizontal reduction vectorization attempt for PHI node
@@ -4205,6 +4207,26 @@ class HorizontalReduction {
42054207
/// splits the vector in halves and adds those halves.
42064208
bool IsPairwiseReduction = false;
42074209

4210+
/// Checks if the ParentStackElem.first should be marked as a reduction
4211+
/// operation with an extra argument or as extra argument itself.
4212+
void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
4213+
Value *ExtraArg) {
4214+
if (ExtraArgs.count(ParentStackElem.first)) {
4215+
ExtraArgs[ParentStackElem.first] = nullptr;
4216+
// We ran into something like:
4217+
// ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
4218+
// The whole ParentStackElem.first should be considered as an extra value
4219+
// in this case.
4220+
// Do not perform analysis of remaining operands of ParentStackElem.first
4221+
// instruction, this whole instruction is an extra argument.
4222+
ParentStackElem.second = ParentStackElem.first->getNumOperands();
4223+
} else {
4224+
// We ran into something like:
4225+
// ParentStackElem.first += ... + ExtraArg + ...
4226+
ExtraArgs[ParentStackElem.first] = ExtraArg;
4227+
}
4228+
}
4229+
42084230
public:
42094231
HorizontalReduction() = default;
42104232

@@ -4257,8 +4279,23 @@ class HorizontalReduction {
42574279
if (EdgeToVist == 2 || IsReducedValue) {
42584280
if (IsReducedValue)
42594281
ReducedVals.push_back(TreeN);
4260-
else
4261-
ReductionOps.push_back(TreeN);
4282+
else {
4283+
auto I = ExtraArgs.find(TreeN);
4284+
if (I != ExtraArgs.end() && !I->second) {
4285+
// Check if TreeN is an extra argument of its parent operation.
4286+
if (Stack.size() <= 1) {
4287+
// TreeN can't be an extra argument as it is a root reduction
4288+
// operation.
4289+
return false;
4290+
}
4291+
// Yes, TreeN is an extra argument, do not add it to a list of
4292+
// reduction operations.
4293+
// Stack[Stack.size() - 2] always points to the parent operation.
4294+
markExtraArg(Stack[Stack.size() - 2], TreeN);
4295+
ExtraArgs.erase(TreeN);
4296+
} else
4297+
ReductionOps.push_back(TreeN);
4298+
}
42624299
// Retract.
42634300
Stack.pop_back();
42644301
continue;
@@ -4275,30 +4312,42 @@ class HorizontalReduction {
42754312
if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
42764313
I->getOpcode() == ReductionOpcode)) {
42774314
// Only handle trees in the current basic block.
4278-
if (I->getParent() != B->getParent())
4279-
return false;
4315+
if (I->getParent() != B->getParent()) {
4316+
// I is an extra argument for TreeN (its parent operation).
4317+
markExtraArg(Stack.back(), I);
4318+
continue;
4319+
}
42804320

42814321
// Each tree node needs to have one user except for the ultimate
42824322
// reduction.
4283-
if (!I->hasOneUse() && I != B)
4284-
return false;
4323+
if (!I->hasOneUse() && I != B) {
4324+
// I is an extra argument for TreeN (its parent operation).
4325+
markExtraArg(Stack.back(), I);
4326+
continue;
4327+
}
42854328

42864329
if (I->getOpcode() == ReductionOpcode) {
42874330
// We need to be able to reassociate the reduction operations.
4288-
if (!I->isAssociative())
4289-
return false;
4331+
if (!I->isAssociative()) {
4332+
// I is an extra argument for TreeN (its parent operation).
4333+
markExtraArg(Stack.back(), I);
4334+
continue;
4335+
}
42904336
} else if (ReducedValueOpcode &&
42914337
ReducedValueOpcode != I->getOpcode()) {
42924338
// Make sure that the opcodes of the operations that we are going to
42934339
// reduce match.
4294-
return false;
4340+
// I is an extra argument for TreeN (its parent operation).
4341+
markExtraArg(Stack.back(), I);
4342+
continue;
42954343
} else if (!ReducedValueOpcode)
42964344
ReducedValueOpcode = I->getOpcode();
42974345

42984346
Stack.push_back(std::make_pair(I, 0));
42994347
continue;
43004348
}
4301-
return false;
4349+
// NextV is an extra argument for TreeN (its parent operation).
4350+
markExtraArg(Stack.back(), NextV);
43024351
}
43034352
}
43044353
return true;
@@ -4367,10 +4416,16 @@ class HorizontalReduction {
43674416
if (VectorizedTree) {
43684417
// Finish the reduction.
43694418
for (; i < NumReducedVals; ++i) {
4419+
auto *I = cast<Instruction>(ReducedVals[i]);
4420+
Builder.SetCurrentDebugLocation(I->getDebugLoc());
4421+
VectorizedTree =
4422+
Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
4423+
}
4424+
for (auto &Pair : ExtraArgs) {
43704425
Builder.SetCurrentDebugLocation(
4371-
cast<Instruction>(ReducedVals[i])->getDebugLoc());
4426+
cast<Instruction>(Pair.first)->getDebugLoc());
43724427
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
4373-
ReducedVals[i]);
4428+
Pair.second, "bin.extra");
43744429
}
43754430
// Update users.
43764431
if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {

‎llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll

+114-134
Original file line numberDiff line numberDiff line change
@@ -69,39 +69,31 @@ define float @bazz() {
6969
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4
7070
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
7171
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
72-
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
73-
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
74-
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
75-
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
76-
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
77-
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
78-
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
79-
; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
80-
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
81-
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
82-
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
83-
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
84-
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
85-
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
72+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
73+
; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
74+
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
75+
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
76+
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
77+
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
78+
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
8679
; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
8780
; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
8881
; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
89-
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
90-
; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
91-
; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
92-
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
93-
; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
94-
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
95-
; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
96-
; CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
97-
; CHECK-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
98-
; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
99-
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
100-
; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
101-
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
102-
; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
103-
; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
104-
; CHECK-NEXT: ret float [[ADD19_3]]
82+
; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
83+
; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
84+
; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
85+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
86+
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
87+
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
88+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
89+
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
90+
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
91+
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
92+
; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
93+
; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
94+
; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
95+
; CHECK-NEXT: store float [[BIN_EXTRA5]], float* @res, align 4
96+
; CHECK-NEXT: ret float [[BIN_EXTRA5]]
10597
;
10698
entry:
10799
%0 = load i32, i32* @n, align 4
@@ -555,102 +547,84 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
555547
; CHECK-NEXT: entry:
556548
; CHECK-NEXT: [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
557549
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[REM]] to float
558-
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
559-
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], [[CONV]]
560-
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
561-
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
562-
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
550+
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
563551
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
564-
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
565-
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
566552
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
567-
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
568-
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
569553
; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
570-
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
571-
; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
572554
; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
573-
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
574-
; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
575555
; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
576-
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
577-
; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
578556
; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
579-
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
580-
; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
581557
; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
582-
; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
583-
; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
584558
; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
585-
; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
586-
; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
587559
; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
588-
; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
589-
; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
590560
; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
591-
; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
592-
; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
593561
; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
594-
; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
595-
; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
596562
; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
597-
; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
598-
; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
599563
; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
600-
; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
601-
; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
602564
; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
603-
; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
604-
; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
605565
; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
606-
; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX_16]], align 4
607-
; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float [[TMP16]], [[ADD_15]]
608566
; CHECK-NEXT: [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
609-
; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX_17]], align 4
610-
; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float [[TMP17]], [[ADD_16]]
611567
; CHECK-NEXT: [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
612-
; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX_18]], align 4
613-
; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float [[TMP18]], [[ADD_17]]
614568
; CHECK-NEXT: [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
615-
; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX_19]], align 4
616-
; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float [[TMP19]], [[ADD_18]]
617569
; CHECK-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
618-
; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[ARRAYIDX_20]], align 4
619-
; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float [[TMP20]], [[ADD_19]]
620570
; CHECK-NEXT: [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
621-
; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX_21]], align 4
622-
; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float [[TMP21]], [[ADD_20]]
623571
; CHECK-NEXT: [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
624-
; CHECK-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX_22]], align 4
625-
; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float [[TMP22]], [[ADD_21]]
626572
; CHECK-NEXT: [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
627-
; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDX_23]], align 4
628-
; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float [[TMP23]], [[ADD_22]]
629573
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
630-
; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX_24]], align 4
631-
; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float [[TMP24]], [[ADD_23]]
632574
; CHECK-NEXT: [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
633-
; CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX_25]], align 4
634-
; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float [[TMP25]], [[ADD_24]]
635575
; CHECK-NEXT: [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
636-
; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX_26]], align 4
637-
; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float [[TMP26]], [[ADD_25]]
638576
; CHECK-NEXT: [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
639-
; CHECK-NEXT: [[TMP27:%.*]] = load float, float* [[ARRAYIDX_27]], align 4
640-
; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float [[TMP27]], [[ADD_26]]
641577
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
642-
; CHECK-NEXT: [[TMP28:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
643-
; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float [[TMP28]], [[ADD_27]]
644578
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
645-
; CHECK-NEXT: [[TMP29:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
646-
; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float [[TMP29]], [[ADD_28]]
647579
; CHECK-NEXT: [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
648-
; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[ARRAYIDX_30]], align 4
649-
; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float [[TMP30]], [[ADD_29]]
650580
; CHECK-NEXT: [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
651-
; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX_31]], align 4
652-
; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float [[TMP31]], [[ADD_30]]
653-
; CHECK-NEXT: ret float [[ADD_31]]
581+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
582+
; CHECK-NEXT: [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
583+
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float undef, [[CONV]]
584+
; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
585+
; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
586+
; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
587+
; CHECK-NEXT: [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
588+
; CHECK-NEXT: [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
589+
; CHECK-NEXT: [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
590+
; CHECK-NEXT: [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
591+
; CHECK-NEXT: [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
592+
; CHECK-NEXT: [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
593+
; CHECK-NEXT: [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
594+
; CHECK-NEXT: [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
595+
; CHECK-NEXT: [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
596+
; CHECK-NEXT: [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
597+
; CHECK-NEXT: [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
598+
; CHECK-NEXT: [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
599+
; CHECK-NEXT: [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
600+
; CHECK-NEXT: [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
601+
; CHECK-NEXT: [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
602+
; CHECK-NEXT: [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
603+
; CHECK-NEXT: [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
604+
; CHECK-NEXT: [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
605+
; CHECK-NEXT: [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
606+
; CHECK-NEXT: [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
607+
; CHECK-NEXT: [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
608+
; CHECK-NEXT: [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
609+
; CHECK-NEXT: [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
610+
; CHECK-NEXT: [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
611+
; CHECK-NEXT: [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
612+
; CHECK-NEXT: [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
613+
; CHECK-NEXT: [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
614+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
615+
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
616+
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
617+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
618+
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
619+
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
620+
; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
621+
; CHECK-NEXT: [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
622+
; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
623+
; CHECK-NEXT: [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
624+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
625+
; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
626+
; CHECK-NEXT: [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
627+
; CHECK-NEXT: ret float [[BIN_EXTRA]]
654628
;
655629
entry:
656630
%rem = srem i32 %a, %b
@@ -948,32 +922,35 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
948922
; CHECK-NEXT: entry:
949923
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
950924
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
951-
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
952925
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
953-
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
954-
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
955-
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
956-
; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
957-
; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
926+
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
958927
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
959-
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
960-
; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD5]]
961928
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
962-
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
963-
; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
964929
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
965-
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
966-
; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
967930
; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
968-
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
969-
; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD4_3]]
970931
; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
971-
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
972-
; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
973932
; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
974-
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
975-
; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
976-
; CHECK-NEXT: ret float [[ADD4_6]]
933+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
934+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
935+
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
936+
; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
937+
; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
938+
; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
939+
; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
940+
; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
941+
; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
942+
; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
943+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
944+
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
945+
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
946+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
947+
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
948+
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
949+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
950+
; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
951+
; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
952+
; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
953+
; CHECK-NEXT: ret float [[BIN_EXTRA5]]
977954
;
978955
entry:
979956
%mul = mul nsw i32 %b, %a
@@ -1011,34 +988,37 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
1011988
; CHECK-NEXT: entry:
1012989
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
1013990
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
1014-
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
1015991
; CHECK-NEXT: [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
1016992
; CHECK-NEXT: [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
1017993
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
1018-
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
1019-
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
1020-
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
1021-
; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
994+
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
1022995
; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
1023-
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
1024-
; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD4]]
1025996
; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
1026-
; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
1027-
; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
1028997
; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
1029-
; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
1030-
; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
1031-
; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
1032998
; CHECK-NEXT: [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
1033-
; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
1034-
; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD5]]
1035999
; CHECK-NEXT: [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
1036-
; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
1037-
; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
10381000
; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
1039-
; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
1040-
; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
1041-
; CHECK-NEXT: ret float [[ADD4_6]]
1001+
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
1002+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
1003+
; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
1004+
; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
1005+
; CHECK-NEXT: [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
1006+
; CHECK-NEXT: [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
1007+
; CHECK-NEXT: [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
1008+
; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
1009+
; CHECK-NEXT: [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
1010+
; CHECK-NEXT: [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
1011+
; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
1012+
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
1013+
; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1014+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
1015+
; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1016+
; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
1017+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
1018+
; CHECK-NEXT: [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
1019+
; CHECK-NEXT: [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
1020+
; CHECK-NEXT: [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
1021+
; CHECK-NEXT: ret float [[BIN_EXTRA5]]
10421022
;
10431023
entry:
10441024
%mul = mul nsw i32 %b, %a

0 commit comments

Comments (0)
Please sign in to comment.