Index: llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ llvm/trunk/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -92,6 +92,12 @@
   /// collected in GEPs.
   bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
 
+  /// Try to find a horizontal reduction or otherwise vectorize a chain of
+  /// binary operators.
+  bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB,
+                                slpvectorizer::BoUpSLP &R,
+                                TargetTransformInfo *TTI);
+
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);
Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3969,36 +3969,40 @@
   if (!V)
     return false;
 
+  Value *P = V->getParent();
+
+  // Vectorize in current basic block only.
+  auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
+  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+    return false;
+
   // Try to vectorize V.
-  if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
+  if (tryToVectorizePair(Op0, Op1, R))
     return true;
 
-  BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
-  BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
+  auto *A = dyn_cast<BinaryOperator>(Op0);
+  auto *B = dyn_cast<BinaryOperator>(Op1);
   // Try to skip B.
   if (B && B->hasOneUse()) {
-    BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
-    BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
-    if (tryToVectorizePair(A, B0, R)) {
+    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+    if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
       return true;
-    }
-    if (tryToVectorizePair(A, B1, R)) {
+    if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
       return true;
-    }
   }
 
   // Try to skip A.
   if (A && A->hasOneUse()) {
-    BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
-    BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
-    if (tryToVectorizePair(A0, B, R)) {
+    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+    if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
       return true;
-    }
-    if (tryToVectorizePair(A1, B, R)) {
+    if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
       return true;
-    }
   }
-  return 0;
+  return false;
 }
 
 /// \brief Generate a shuffle mask to be used in a reduction tree.
@@ -4449,29 +4453,143 @@
   return nullptr;
 }
 
+namespace {
+/// Tracks an instruction and its children.
+class WeakVHWithLevel final : public CallbackVH {
+  /// Operand index of the instruction currently being analyzed.
+  unsigned Level = 0;
+  /// Is this the instruction that should be vectorized, or are we now
+  /// processing children (i.e. operands of this instruction) for potential
+  /// vectorization?
+  bool IsInitial = true;
+
+public:
+  explicit WeakVHWithLevel() = default;
+  WeakVHWithLevel(Value *V) : CallbackVH(V){};
+  /// Restart children analysis each time it is replaced by a new instruction.
+  void allUsesReplacedWith(Value *New) override {
+    setValPtr(New);
+    Level = 0;
+    IsInitial = true;
+  }
+  /// Check that the instruction was not deleted during vectorization.
+  bool isValid() const { return !getValPtr(); }
+  /// Must the instruction itself be vectorized?
+  bool isInitial() const { return IsInitial; }
+  /// Try to vectorize children.
+  void clearInitial() { IsInitial = false; }
+  /// Are all children processed already?
+  bool isFinal() const {
+    assert(getValPtr() &&
+           (isa<Instruction>(getValPtr()) &&
+            cast<Instruction>(getValPtr())->getNumOperands() >= Level));
+    return getValPtr() &&
+           cast<Instruction>(getValPtr())->getNumOperands() == Level;
+  }
+  /// Get next child operation.
+  Value *nextOperand() {
+    assert(getValPtr() && isa<Instruction>(getValPtr()) &&
+           cast<Instruction>(getValPtr())->getNumOperands() > Level);
+    return cast<Instruction>(getValPtr())->getOperand(Level++);
+  }
+  virtual ~WeakVHWithLevel() = default;
+};
+} // namespace
+
 /// \brief Attempt to reduce a horizontal reduction.
 /// If it is legal to match a horizontal reduction feeding
-/// the phi node P with reduction operators BI, then check if it
-/// can be done.
+/// the phi node P with the reduction root Root in a basic block BB, then check
+/// if it can be done.
 /// \returns true if a horizontal reduction was matched and reduced.
 /// \returns false if a horizontal reduction was not matched.
-static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
-                                        BoUpSLP &R, TargetTransformInfo *TTI,
-                                        unsigned MinRegSize) {
+static bool canBeVectorized(
+    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+    TargetTransformInfo *TTI,
+    const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
   if (!ShouldVectorizeHor)
     return false;
 
-  HorizontalReduction HorRdx(MinRegSize);
-  if (!HorRdx.matchAssociativeReduction(P, BI))
+  if (!Root)
     return false;
 
-  // If there is a sufficient number of reduction values, reduce
-  // to a nearby power-of-2. Can safely generate oversized
-  // vectors and rely on the backend to split them to legal sizes.
-  HorRdx.ReduxWidth =
-      std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+  if (Root->getParent() != BB)
+    return false;
+  SmallVector<WeakVHWithLevel, 8> Stack(1, Root);
+  SmallSet<Value *, 8> VisitedInstrs;
+  bool Res = false;
+  while (!Stack.empty()) {
+    Value *V = Stack.back();
+    if (!V) {
+      Stack.pop_back();
+      continue;
+    }
+    auto *Inst = dyn_cast<Instruction>(V);
+    if (!Inst || isa<PHINode>(Inst)) {
+      Stack.pop_back();
+      continue;
+    }
+    if (Stack.back().isInitial()) {
+      Stack.back().clearInitial();
+      if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
+        HorizontalReduction HorRdx(R.getMinVecRegSize());
+        if (HorRdx.matchAssociativeReduction(P, BI)) {
+          // If there is a sufficient number of reduction values, reduce
+          // to a nearby power-of-2. Can safely generate oversized
+          // vectors and rely on the backend to split them to legal sizes.
+          HorRdx.ReduxWidth =
+              std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));
+
+          if (HorRdx.tryToReduce(R, TTI)) {
+            Res = true;
+            P = nullptr;
+            continue;
+          }
+        }
+        if (P) {
+          Inst = dyn_cast<Instruction>(BI->getOperand(0));
+          if (Inst == P)
+            Inst = dyn_cast<Instruction>(BI->getOperand(1));
+          if (!Inst) {
+            P = nullptr;
+            continue;
+          }
+        }
+      }
+      P = nullptr;
+      if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+        Res = true;
+        continue;
+      }
+    }
+    if (Stack.back().isFinal()) {
+      Stack.pop_back();
+      continue;
+    }
+
+    if (auto *NextV = dyn_cast<Instruction>(Stack.back().nextOperand()))
+      if (NextV->getParent() == BB && VisitedInstrs.insert(NextV).second &&
+          Stack.size() < RecursionMaxDepth)
+        Stack.push_back(NextV);
+  }
+  return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+                                                 BasicBlock *BB, BoUpSLP &R,
+                                                 TargetTransformInfo *TTI) {
+  if (!V)
+    return false;
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
 
-  return HorRdx.tryToReduce(R, TTI);
+  if (!isa<BinaryOperator>(I))
+    P = nullptr;
+  // Try to match and vectorize a horizontal reduction.
+  return canBeVectorized(P, I, BB, R, TTI,
+                         [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
+                           return tryToVectorize(BI, R);
+                         });
 }
 
 bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
@@ -4541,67 +4659,42 @@
       if (P->getNumIncomingValues() != 2)
         return Changed;
 
-      Value *Rdx = getReductionValue(DT, P, BB, LI);
-
-      // Check if this is a Binary Operator.
-      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
-      if (!BI)
-        continue;
-
       // Try to match and vectorize a horizontal reduction.
-      if (canMatchHorizontalReduction(P, BI, R, TTI, R.getMinVecRegSize())) {
+      if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+                                   TTI)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }
-
-      Value *Inst = BI->getOperand(0);
-      if (Inst == P)
-        Inst = BI->getOperand(1);
-
-      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
-        // We would like to start over since some instructions are deleted
-        // and the iterator may become invalid value.
-        Changed = true;
-        it = BB->begin();
-        e = BB->end();
-        continue;
-      }
-
      continue;
    }
 
-    if (ShouldStartVectorizeHorAtStore)
-      if (StoreInst *SI = dyn_cast<StoreInst>(it))
-        if (BinaryOperator *BinOp =
-                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
-          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                          R.getMinVecRegSize()) ||
-              tryToVectorize(BinOp, R)) {
-            Changed = true;
-            it = BB->begin();
-            e = BB->end();
-            continue;
-          }
+    if (ShouldStartVectorizeHorAtStore) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
+        // Try to match and vectorize a horizontal reduction.
+        if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
+                                     TTI)) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
        }
+      }
+    }
 
    // Try to vectorize horizontal reductions feeding into a return.
-    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
-      if (RI->getNumOperands() != 0)
-        if (BinaryOperator *BinOp =
-                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
-          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
-          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                          R.getMinVecRegSize()) ||
-              tryToVectorizePair(BinOp->getOperand(0), BinOp->getOperand(1),
-                                 R)) {
-            Changed = true;
-            it = BB->begin();
-            e = BB->end();
-            continue;
-          }
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
+      if (RI->getNumOperands() != 0) {
+        // Try to match and vectorize a horizontal reduction.
+        if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
+          Changed = true;
+          it = BB->begin();
+          e = BB->end();
+          continue;
        }
+      }
+    }
 
    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
@@ -4614,16 +4707,14 @@
        continue;
      }
 
-      for (int i = 0; i < 2; ++i) {
-        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
-          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
-            Changed = true;
-            // We would like to start over since some instructions are deleted
-            // and the iterator may become invalid value.
-            it = BB->begin();
-            e = BB->end();
-            break;
-          }
+      for (int I = 0; I < 2; ++I) {
+        if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
+          Changed = true;
+          // We would like to start over since some instructions are deleted
+          // and the iterator may become invalid value.
+          it = BB->begin();
+          e = BB->end();
+          break;
        }
      }
      continue;
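As an illustration only (this function is hypothetical and is not part of the patch or of the test file below), the shape of code the new vectorizeRootInstruction entry point starts from looks like the following: a chain of fast-math fadds feeding a return (stored values and the operands of compares are treated as roots in the same way). canBeVectorized walks the operands of such a root with an explicit WeakVHWithLevel stack, staying inside the root's basic block and stopping at RecursionMaxDepth, and either matches a horizontal reduction or falls back to tryToVectorize on the binary operators it visits.

; Hypothetical input, for illustration of the reduction shape only.
define float @sum4(float* %p) {
entry:
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %p2 = getelementptr inbounds float, float* %p, i64 2
  %p3 = getelementptr inbounds float, float* %p, i64 3
  %a0 = load float, float* %p, align 4
  %a1 = load float, float* %p1, align 4
  %a2 = load float, float* %p2, align 4
  %a3 = load float, float* %p3, align 4
  %s0 = fadd fast float %a0, %a1
  %s1 = fadd fast float %s0, %a2
  %s2 = fadd fast float %s1, %a3
  ret float %s2
}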
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -11,26 +11,25 @@
 ; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP8]], [[ADD_1]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP7]], i32 1
-; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP9]], [[ADD_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
 ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
-; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]]
-; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD19]]
-; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP8]], [[ADD19_1]]
-; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP9]], [[ADD19_2]]
+; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
+; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
+; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
+; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
 ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
 ; CHECK-NEXT: ret float [[ADD19_3]]
 ;
@@ -68,40 +67,37 @@
 ; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
 ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4), align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
-; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 5), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
-; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP16]], [[ADD19_1]]
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP17]], [[ADD19_2]]
+; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
+; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
+; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
+; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
+; CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
+; CHECK-NEXT: [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
+; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
+; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
 ; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4
 ; CHECK-NEXT: ret float [[ADD19_3]]
 ;
@@ -152,24 +148,20 @@
 ; CHECK-LABEL: @bazzz(
 ; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
-; CHECK-NEXT: store float [[TMP12]], float* @res, align 4
-; CHECK-NEXT: ret float [[TMP12]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; CHECK-NEXT: store float [[TMP8]], float* @res, align 4
+; CHECK-NEXT: ret float [[TMP8]]
 ;
 entry:
   %0 = load i32, i32* @n, align 4
@@ -198,23 +190,19 @@
 ; CHECK-LABEL: @foo(
 ; CHECK: [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
-; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[CONV]], [[TMP11]]
-; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP12]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float undef, undef
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
 ; CHECK-NEXT: store i32 [[CONV4]], i32* @n, align 4
 ; CHECK-NEXT: ret i32 [[CONV4]]
 ;
@@ -244,8 +232,7 @@
 define float @bar() {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; CHECK: [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
 ; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0