Index: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -330,6 +330,10 @@
   /// \brief Vectorize the tree that starts with the elements in \p VL.
   /// Returns the vectorized root.
   Value *vectorizeTree();
+  /// Vectorize the tree but with the list of externally used values \p
+  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+  /// generated extractelement instructions.
+  Value *vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues);

   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
@@ -343,6 +347,13 @@
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
   void buildTree(ArrayRef<Value *> Roots,
                  ArrayRef<Value *> UserIgnoreLst = None);
+  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+  /// the purpose of scheduling and extraction in the \p UserIgnoreLst, taking
+  /// into account (and updating it, if required) the list of externally used
+  /// values stored in \p ExternallyUsedValues.
+  void buildTree(ArrayRef<Value *> Roots,
+                 MapVector<Value *, DebugLoc> &ExternallyUsedValues,
+                 ArrayRef<Value *> UserIgnoreLst = None);

   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
@@ -576,7 +587,9 @@
   SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

   /// A list of values that need to extracted out of the tree.
-  /// This list holds pairs of (Internal Scalar : External User).
+  /// This list holds pairs of (Internal Scalar : External User). External User
+  /// can be nullptr, meaning that this Internal Scalar will be used later,
+  /// after vectorization.
   UserList ExternalUses;

   /// Values used only by @llvm.assume calls.
@@ -940,6 +953,12 @@
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
+  MapVector<Value *, DebugLoc> ExternallyUsedValues;
+  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+                        MapVector<Value *, DebugLoc> &ExternallyUsedValues,
+                        ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
@@ -958,6 +977,14 @@
       if (Entry->NeedToGather)
         continue;

+      // Check if the scalar is externally used as an extra arg.
+      auto ExtI = ExternallyUsedValues.find(Scalar);
+      if (ExtI != ExternallyUsedValues.end()) {
+        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+              Lane << " from " << *Scalar << ".\n");
+        ExternalUses.emplace_back(Scalar, nullptr, Lane);
+        continue;
+      }
       for (User *U : Scalar->users()) {
         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

@@ -2768,6 +2795,12 @@
 }

 Value *BoUpSLP::vectorizeTree() {
+  MapVector<Value *, DebugLoc> ExternallyUsedValues;
+  return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(MapVector<Value *, DebugLoc> &ExternallyUsedValues) {
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
@@ -2810,7 +2843,7 @@
     // Skip users that we already RAUW. This happens when one instruction
     // has multiple uses of the same value.
-    if (!is_contained(Scalar->users(), User))
+    if (User && !is_contained(Scalar->users(), User))
       continue;
     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
@@ -2822,6 +2855,28 @@
     assert(Vec && "Can't find vectorizable value");

     Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    // If User == nullptr, the Scalar is used as an extra arg. Generate an
+    // extractelement instruction and update the record for this scalar in
+    // ExternallyUsedValues.
+    if (!User) {
+      assert(ExternallyUsedValues.count(Scalar) &&
+             "Scalar with nullptr as an external user must be registered in "
+             "ExternallyUsedValues map");
+      DebugLoc DL = ExternallyUsedValues[Scalar];
+      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+        Builder.SetInsertPoint(VecI->getParent(),
+                               std::next(VecI->getIterator()));
+      } else {
+        Builder.SetInsertPoint(&F->getEntryBlock().front());
+      }
+      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      Ex = extend(ScalarRoot, Ex, Scalar->getType());
+      CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+      ExternallyUsedValues.erase(Scalar);
+      ExternallyUsedValues[Ex] = DL;
+      continue;
+    }
+
     // Generate extracts for out-of-tree users.
     // Find the insertion point for the extractelement lane.
     if (auto *VecI = dyn_cast<Instruction>(Vec)) {
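Note: the two overloads added above are meant to be used as a pair. A condensed sketch of the intended calling pattern, lifted from the HorizontalReduction::tryToReduce() changes further down (simplified; the surrounding declarations are assumed, and UseExtractedScalar() is a hypothetical consumer, not part of the patch):

    // Seed the map with scalars that must stay live after vectorization,
    // keyed by value, with the debug location to use as the payload.
    MapVector<Value *, DebugLoc> ExternallyUsedValues;
    for (auto &Pair : ExtraArgs)
      ExternallyUsedValues[Pair.second] = Pair.first->getDebugLoc();

    V.buildTree(VL, ExternallyUsedValues, ReductionOps);
    Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);

    // vectorizeTree() re-keys each entry from the original scalar to the
    // extractelement it emitted, so the extracted values can be consumed.
    for (auto &Pair : ExternallyUsedValues)
      UseExtractedScalar(Pair.first, Pair.second); // hypothetical consumer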
@@ -4189,6 +4244,8 @@
 class HorizontalReduction {
   SmallVector<Value *, 16> ReductionOps;
   SmallVector<Value *, 32> ReducedVals;
+  // Use map vector to make stable output.
+  MapVector<Instruction *, Value *> ExtraArgs;

   BinaryOperator *ReductionRoot = nullptr;
   // After successfull horizontal reduction vectorization attempt for PHI node
@@ -4208,6 +4265,26 @@
   /// splits the vector in halves and adds those halves.
   bool IsPairwiseReduction = false;

+  /// Checks if the ParentStackElem.first should be marked as a reduction
+  /// operation with an extra argument or as an extra argument itself.
+  void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+                    Value *ExtraArg) {
+    if (ExtraArgs.count(ParentStackElem.first)) {
+      ExtraArgs[ParentStackElem.first] = nullptr;
+      // We ran into something like:
+      // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+      // The whole ParentStackElem.first should be considered as an extra value
+      // in this case.
+      // Do not perform analysis of the remaining operands of the
+      // ParentStackElem.first instruction; this whole instruction is an extra
+      // argument.
+      ParentStackElem.second = ParentStackElem.first->getNumOperands();
+    } else {
+      // We ran into something like:
+      // ParentStackElem.first += ... + ExtraArg + ...
+      ExtraArgs[ParentStackElem.first] = ExtraArg;
+    }
+  }
+
 public:
   HorizontalReduction() = default;
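Note: a minimal self-contained model of the ExtraArgs bookkeeping above, using plain std types in place of the LLVM ones (the Op struct and main() are illustrative assumptions, not part of the patch):

    #include <cassert>
    #include <map>
    #include <utility>

    struct Op { int NumOperands = 2; };  // stands in for llvm::Instruction

    std::map<Op *, Op *> ExtraArgs;      // reduction op -> its one extra arg

    // The first extra operand of a parent op is recorded; a second one turns
    // the entry into nullptr, i.e. "treat the whole parent as an extra
    // value", and operand analysis of the parent stops -- the same
    // invalidation markExtraArg() performs through ParentStackElem.
    void markExtraArg(std::pair<Op *, int> &Parent, Op *Extra) {
      if (ExtraArgs.count(Parent.first)) {
        ExtraArgs[Parent.first] = nullptr;
        Parent.second = Parent.first->NumOperands;
      } else {
        ExtraArgs[Parent.first] = Extra;
      }
    }

    int main() {
      Op Parent, A, B;
      std::pair<Op *, int> Elem{&Parent, 0};
      markExtraArg(Elem, &A);
      assert(ExtraArgs[&Parent] == &A);      // one extra arg: still reducible
      markExtraArg(Elem, &B);
      assert(ExtraArgs[&Parent] == nullptr); // two: parent itself is extra
      assert(Elem.second == Parent.NumOperands);
    }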
@@ -4260,8 +4337,23 @@
     if (EdgeToVist == 2 || IsReducedValue) {
       if (IsReducedValue)
         ReducedVals.push_back(TreeN);
-      else
-        ReductionOps.push_back(TreeN);
+      else {
+        auto I = ExtraArgs.find(TreeN);
+        if (I != ExtraArgs.end() && !I->second) {
+          // Check if TreeN is an extra argument of its parent operation.
+          if (Stack.size() <= 1) {
+            // TreeN can't be an extra argument as it is a root reduction
+            // operation.
+            return false;
+          }
+          // Yes, TreeN is an extra argument, do not add it to a list of
+          // reduction operations.
+          // Stack[Stack.size() - 2] always points to the parent operation.
+          markExtraArg(Stack[Stack.size() - 2], TreeN);
+          ExtraArgs.erase(TreeN);
+        } else
+          ReductionOps.push_back(TreeN);
+      }
       // Retract.
       Stack.pop_back();
       continue;
@@ -4278,30 +4370,42 @@
       if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
                 I->getOpcode() == ReductionOpcode)) {
         // Only handle trees in the current basic block.
-        if (I->getParent() != B->getParent())
-          return false;
+        if (I->getParent() != B->getParent()) {
+          // I is an extra argument for TreeN (its parent operation).
+          markExtraArg(Stack.back(), I);
+          continue;
+        }

         // Each tree node needs to have one user except for the ultimate
         // reduction.
-        if (!I->hasOneUse() && I != B)
-          return false;
+        if (!I->hasOneUse() && I != B) {
+          // I is an extra argument for TreeN (its parent operation).
+          markExtraArg(Stack.back(), I);
+          continue;
+        }

         if (I->getOpcode() == ReductionOpcode) {
           // We need to be able to reassociate the reduction operations.
-          if (!I->isAssociative())
-            return false;
+          if (!I->isAssociative()) {
+            // I is an extra argument for TreeN (its parent operation).
+            markExtraArg(Stack.back(), I);
+            continue;
+          }
         } else if (ReducedValueOpcode &&
                    ReducedValueOpcode != I->getOpcode()) {
           // Make sure that the opcodes of the operations that we are going to
           // reduce match.
-          return false;
+          // I is an extra argument for TreeN (its parent operation).
+          markExtraArg(Stack.back(), I);
+          continue;
         } else if (!ReducedValueOpcode)
           ReducedValueOpcode = I->getOpcode();

         Stack.push_back(std::make_pair(I, 0));
         continue;
       }
-      return false;
+      // NextV is an extra argument for TreeN (its parent operation).
+      markExtraArg(Stack.back(), NextV);
     }
   }
   return true;
@@ -4329,12 +4433,15 @@
     Builder.setFastMathFlags(Unsafe);

     unsigned i = 0;
+    MapVector<Value *, DebugLoc> ExternallyUsedValues;
+    for (auto &Pair : ExtraArgs)
+      ExternallyUsedValues[Pair.second] = Pair.first->getDebugLoc();
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, ReductionOps);
+      V.buildTree(VL, ExternallyUsedValues, ReductionOps);
       if (V.shouldReorder()) {
         SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
-        V.buildTree(Reversed, ReductionOps);
+        V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
@@ -4352,7 +4459,7 @@

       // Vectorize a tree.
       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
-      Value *VectorizedRoot = V.vectorizeTree();
+      Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);

       // Emit a reduction.
       Value *ReducedSubTree =
@@ -4370,10 +4477,15 @@
     if (VectorizedTree) {
       // Finish the reduction.
       for (; i < NumReducedVals; ++i) {
-        Builder.SetCurrentDebugLocation(
-            cast<Instruction>(ReducedVals[i])->getDebugLoc());
+        auto *I = cast<Instruction>(ReducedVals[i]);
+        Builder.SetCurrentDebugLocation(I->getDebugLoc());
+        VectorizedTree =
+            Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+      }
+      for (auto &Pair : ExternallyUsedValues) {
+        Builder.SetCurrentDebugLocation(Pair.second);
         VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
-                                             ReducedVals[i]);
+                                             Pair.first, "bin.extra");
       }
       // Update users.
       if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
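Note: the test updates below exercise exactly this shape. For orientation, a hypothetical C++ source function (loosely mirroring @bazz in the test file; names and types are illustrative, not taken from the tests) whose fast-math build produces a reduction with extra arguments:

    // conv and conv6 are not loads from the reduced arrays, so they become
    // "extra args": the eight products are vectorized and reduced, and conv
    // and conv6 are re-applied to the scalar result as "bin.extra" adds.
    float bazz_like(const float *a, const float *b, int n) {
      float conv = static_cast<float>(n * 3);  // extra arg #1
      float s = a[0] * b[0] + conv;
      s += a[1] * b[1];
      s += a[2] * b[2];
      s += a[3] * b[3];
      float conv6 = static_cast<float>(n * 4); // extra arg #2, mid-chain
      s += conv6;
      s += a[4] * b[4];
      s += a[5] * b[5];
      s += a[6] * b[6];
      s += a[7] * b[7];
      return s;
    }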
Index: llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
===================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -97,78 +97,62 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
 ; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; CHECK-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; CHECK-NEXT:    [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; CHECK-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
-; CHECK-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
-; CHECK-NEXT:    [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
-; CHECK-NEXT:    store float [[ADD19_3]], float* @res, align 4
-; CHECK-NEXT:    ret float [[ADD19_3]]
+; CHECK-NEXT:    [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
+; CHECK-NEXT:    [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
+; CHECK-NEXT:    [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
+; CHECK-NEXT:    [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
+; CHECK-NEXT:    store float [[BIN_EXTRA5]], float* @res, align 4
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 ; THRESHOLD-LABEL: @bazz(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
 ; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; THRESHOLD-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
-; THRESHOLD-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
-; THRESHOLD-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; THRESHOLD-NEXT:    [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
-; THRESHOLD-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; THRESHOLD-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
-; THRESHOLD-NEXT:    [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
-; THRESHOLD-NEXT:    [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
-; THRESHOLD-NEXT:    [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
-; THRESHOLD-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
-; THRESHOLD-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
-; THRESHOLD-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
-; THRESHOLD-NEXT:    [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
-; THRESHOLD-NEXT:    store float [[ADD19_3]], float* @res, align 4
-; THRESHOLD-NEXT:    ret float [[ADD19_3]]
+; THRESHOLD-NEXT:    [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
+; THRESHOLD-NEXT:    [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
+; THRESHOLD-NEXT:    [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
+; THRESHOLD-NEXT:    [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
+; THRESHOLD-NEXT:    store float [[BIN_EXTRA5]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 entry:
   %0 = load i32, i32* @n, align 4
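Note: the RDX_SHUF/BIN_RDX/BIN_EXTRA lines above are a log2-depth "split the vector in halves and add" reduction followed by scalar fix-up adds. A self-contained model of what the emitted code computes (plain C++; illustrative only):

    #include <array>
    #include <cstdio>

    // Models the <8 x float> reduction in @bazz: three shuffle+fadd rounds
    // halve the active width, lane 0 is extracted, and the extra args
    // (conv, conv6) are folded back in as the bin.extra adds.
    float reduceWithExtraArgs(std::array<float, 8> Lanes, float Conv,
                              float Conv6) {
      for (int Width = 4; Width >= 1; Width /= 2) // RDX_SHUF / BIN_RDX rounds
        for (int I = 0; I < Width; ++I)
          Lanes[I] += Lanes[I + Width];
      float R = Lanes[0]; // extractelement, lane 0
      R += Conv;          // [[BIN_EXTRA]]
      R += Conv6;         // [[BIN_EXTRA5]]
      return R;
    }

    int main() {
      std::printf("%g\n", reduceWithExtraArgs({1, 2, 3, 4, 5, 6, 7, 8}, 30, 40));
    }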
@@ -806,203 +790,167 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
 ; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
-; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
 ; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
-; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
 ; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
-; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
 ; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
-; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
 ; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
-; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
 ; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
-; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
 ; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
-; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
-; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
 ; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX_16]], align 4
-; CHECK-NEXT:    [[ADD_16:%.*]] = fadd fast float [[TMP16]], [[ADD_15]]
 ; CHECK-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX_17]], align 4
-; CHECK-NEXT:    [[ADD_17:%.*]] = fadd fast float [[TMP17]], [[ADD_16]]
 ; CHECK-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX_18]], align 4
-; CHECK-NEXT:    [[ADD_18:%.*]] = fadd fast float [[TMP18]], [[ADD_17]]
 ; CHECK-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
-; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX_19]], align 4
-; CHECK-NEXT:    [[ADD_19:%.*]] = fadd fast float [[TMP19]], [[ADD_18]]
 ; CHECK-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
-; CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[ARRAYIDX_20]], align 4
-; CHECK-NEXT:    [[ADD_20:%.*]] = fadd fast float [[TMP20]], [[ADD_19]]
 ; CHECK-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX_21]], align 4
-; CHECK-NEXT:    [[ADD_21:%.*]] = fadd fast float [[TMP21]], [[ADD_20]]
 ; CHECK-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
-; CHECK-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX_22]], align 4
-; CHECK-NEXT:    [[ADD_22:%.*]] = fadd fast float [[TMP22]], [[ADD_21]]
 ; CHECK-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[ARRAYIDX_23]], align 4
-; CHECK-NEXT:    [[ADD_23:%.*]] = fadd fast float [[TMP23]], [[ADD_22]]
 ; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX_24]], align 4
-; CHECK-NEXT:    [[ADD_24:%.*]] = fadd fast float [[TMP24]], [[ADD_23]]
 ; CHECK-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
-; CHECK-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX_25]], align 4
-; CHECK-NEXT:    [[ADD_25:%.*]] = fadd fast float [[TMP25]], [[ADD_24]]
 ; CHECK-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX_26]], align 4
-; CHECK-NEXT:    [[ADD_26:%.*]] = fadd fast float [[TMP26]], [[ADD_25]]
 ; CHECK-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
-; CHECK-NEXT:    [[TMP27:%.*]] = load float, float* [[ARRAYIDX_27]], align 4
-; CHECK-NEXT:    [[ADD_27:%.*]] = fadd fast float [[TMP27]], [[ADD_26]]
 ; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
-; CHECK-NEXT:    [[TMP28:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
-; CHECK-NEXT:    [[ADD_28:%.*]] = fadd fast float [[TMP28]], [[ADD_27]]
 ; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
-; CHECK-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float [[TMP29]], [[ADD_28]]
 ; CHECK-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* [[ARRAYIDX_30]], align 4
-; CHECK-NEXT:    [[ADD_30:%.*]] = fadd fast float [[TMP30]], [[ADD_29]]
 ; CHECK-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
-; CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX_31]], align 4
-; CHECK-NEXT:    [[ADD_31:%.*]] = fadd fast float [[TMP31]], [[ADD_30]]
-; CHECK-NEXT:    ret float [[ADD_31]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT:    [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT:    [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT:    [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT:    [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT:    [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT:    [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT:    [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT:    [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA]]
 ;
 ; THRESHOLD-LABEL: @f1(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
-; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], [[CONV]]
-; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
-; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
+; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; THRESHOLD-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
-; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
-; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
-; THRESHOLD-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
-; THRESHOLD-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
-; THRESHOLD-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
-; THRESHOLD-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
-; THRESHOLD-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
-; THRESHOLD-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
-; THRESHOLD-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
-; THRESHOLD-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
-; THRESHOLD-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
-; THRESHOLD-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
-; THRESHOLD-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
-; THRESHOLD-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
-; THRESHOLD-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
-; THRESHOLD-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
-; THRESHOLD-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
-; THRESHOLD-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; THRESHOLD-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
-; THRESHOLD-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
-; THRESHOLD-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
-; THRESHOLD-NEXT:    [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
-; THRESHOLD-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX_16]], align 4
-; THRESHOLD-NEXT:    [[ADD_16:%.*]] = fadd fast float [[TMP16]], [[ADD_15]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
-; THRESHOLD-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX_17]], align 4
-; THRESHOLD-NEXT:    [[ADD_17:%.*]] = fadd fast float [[TMP17]], [[ADD_16]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
-; THRESHOLD-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX_18]], align 4
-; THRESHOLD-NEXT:    [[ADD_18:%.*]] = fadd fast float [[TMP18]], [[ADD_17]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
-; THRESHOLD-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX_19]], align 4
-; THRESHOLD-NEXT:    [[ADD_19:%.*]] = fadd fast float [[TMP19]], [[ADD_18]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
-; THRESHOLD-NEXT:    [[TMP20:%.*]] = load float, float* [[ARRAYIDX_20]], align 4
-; THRESHOLD-NEXT:    [[ADD_20:%.*]] = fadd fast float [[TMP20]], [[ADD_19]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
-; THRESHOLD-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX_21]], align 4
-; THRESHOLD-NEXT:    [[ADD_21:%.*]] = fadd fast float [[TMP21]], [[ADD_20]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
-; THRESHOLD-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX_22]], align 4
-; THRESHOLD-NEXT:    [[ADD_22:%.*]] = fadd fast float [[TMP22]], [[ADD_21]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
-; THRESHOLD-NEXT:    [[TMP23:%.*]] = load float, float* [[ARRAYIDX_23]], align 4
-; THRESHOLD-NEXT:    [[ADD_23:%.*]] = fadd fast float [[TMP23]], [[ADD_22]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
-; THRESHOLD-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX_24]], align 4
-; THRESHOLD-NEXT:    [[ADD_24:%.*]] = fadd fast float [[TMP24]], [[ADD_23]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
-; THRESHOLD-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX_25]], align 4
-; THRESHOLD-NEXT:    [[ADD_25:%.*]] = fadd fast float [[TMP25]], [[ADD_24]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
-; THRESHOLD-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX_26]], align 4
-; THRESHOLD-NEXT:    [[ADD_26:%.*]] = fadd fast float [[TMP26]], [[ADD_25]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
-; THRESHOLD-NEXT:    [[TMP27:%.*]] = load float, float* [[ARRAYIDX_27]], align 4
-; THRESHOLD-NEXT:    [[ADD_27:%.*]] = fadd fast float [[TMP27]], [[ADD_26]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
-; THRESHOLD-NEXT:    [[TMP28:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
-; THRESHOLD-NEXT:    [[ADD_28:%.*]] = fadd fast float [[TMP28]], [[ADD_27]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
-; THRESHOLD-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT:    [[ADD_29:%.*]] = fadd fast float [[TMP29]], [[ADD_28]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; THRESHOLD-NEXT:    [[TMP30:%.*]] = load float, float* [[ARRAYIDX_30]], align 4
-; THRESHOLD-NEXT:    [[ADD_30:%.*]] = fadd fast float [[TMP30]], [[ADD_29]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
-; THRESHOLD-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX_31]], align 4
-; THRESHOLD-NEXT:    [[ADD_31:%.*]] = fadd fast float [[TMP31]], [[ADD_30]]
-; THRESHOLD-NEXT:    ret float [[ADD_31]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT:    [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT:    [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT:    [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT:    [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT:    [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT:    [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT:    [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT:    [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA]]
 ;
 entry:
   %rem = srem i32 %a, %b
@@ -1396,63 +1344,69 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
-; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
-; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD5]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
-; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
-; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
-; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD4_3]]
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
-; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
-; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
-; CHECK-NEXT:    ret float [[ADD4_6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 ; THRESHOLD-LABEL: @extra_args(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
 ; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
-; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
-; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
-; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
-; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD5]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
-; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
-; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
-; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD4_3]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
-; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
-; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
-; THRESHOLD-NEXT:    ret float [[ADD4_6]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 entry:
   %mul = mul nsw i32 %b, %a
@@ -1490,67 +1444,73 @@
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
 ; CHECK-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
 ; CHECK-NEXT:    [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
-; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD4]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
-; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
-; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
-; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
-; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD5]]
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
-; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
-; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
-; CHECK-NEXT:    ret float [[ADD4_6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
+; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 ; THRESHOLD-LABEL: @extra_args_no_replace(
 ; THRESHOLD-NEXT:  entry:
 ; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
 ; THRESHOLD-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
 ; THRESHOLD-NEXT:    [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
 ; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
-; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
-; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
+; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
-; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD4]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; THRESHOLD-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
-; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; THRESHOLD-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
-; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
-; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
-; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD5]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
-; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
 ; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
-; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
-; THRESHOLD-NEXT:    ret float [[ADD4_6]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
+; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 entry:
   %mul = mul nsw i32 %b, %a
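Note: in @wobble below, the last xor is both a reduced lane and an extra argument of the final add, so the vectorized form re-reads it as lane 3 of the vector xor ([[TMP9]]). A self-contained model (plain C++; illustrative only):

    #include <cstdio>

    // Models the vectorized @wobble: four identical xor lanes reduced via
    // sext(icmp eq 0); %arg and lane 3 of the xor are the two extra args,
    // folded back in as [[BIN_EXTRA]] and [[BIN_EXTRA3]].
    int wobble(int Arg, int Bar) {
      int X[4], S[4];
      for (int I = 0; I < 4; ++I) {
        X[I] = Arg ^ Bar;          // the <4 x i32> xor
        S[I] = X[I] == 0 ? -1 : 0; // sext(icmp eq <4 x i32> ..., zero)
      }
      int R = S[0] + S[1] + S[2] + S[3]; // shuffle/add reduction
      R += Arg;                          // [[BIN_EXTRA]]
      R += X[3];                         // [[BIN_EXTRA3]], extractelement i32 3
      return R;
    }

    int main() { std::printf("%d\n", wobble(5, 3)); }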
@@ -1588,45 +1548,59 @@
 define i32 @wobble(i32 %arg, i32 %bar) {
 ; CHECK-LABEL: @wobble(
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[X1:%.*]] = xor i32 [[ARG:%.*]], [[BAR:%.*]]
-; CHECK-NEXT:    [[I1:%.*]] = icmp eq i32 [[X1]], 0
-; CHECK-NEXT:    [[S1:%.*]] = sext i1 [[I1]] to i32
-; CHECK-NEXT:    [[X2:%.*]] = xor i32 [[ARG]], [[BAR]]
-; CHECK-NEXT:    [[I2:%.*]] = icmp eq i32 [[X2]], 0
-; CHECK-NEXT:    [[S2:%.*]] = sext i1 [[I2]] to i32
-; CHECK-NEXT:    [[X3:%.*]] = xor i32 [[ARG]], [[BAR]]
-; CHECK-NEXT:    [[I3:%.*]] = icmp eq i32 [[X3]], 0
-; CHECK-NEXT:    [[S3:%.*]] = sext i1 [[I3]] to i32
-; CHECK-NEXT:    [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
-; CHECK-NEXT:    [[I4:%.*]] = icmp eq i32 [[X4]], 0
-; CHECK-NEXT:    [[S4:%.*]] = sext i1 [[I4]] to i32
-; CHECK-NEXT:    [[R1:%.*]] = add i32 [[ARG]], [[S1]]
-; CHECK-NEXT:    [[R2:%.*]] = add i32 [[R1]], [[S2]]
-; CHECK-NEXT:    [[R3:%.*]] = add i32 [[R2]], [[S3]]
-; CHECK-NEXT:    [[R4:%.*]] = add i32 [[R3]], [[S4]]
-; CHECK-NEXT:    [[R5:%.*]] = add i32 [[R4]], [[X4]]
-; CHECK-NEXT:    ret i32 [[R5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[R1:%.*]] = add i32 [[ARG]], undef
+; CHECK-NEXT:    [[R2:%.*]] = add i32 [[R1]], undef
+; CHECK-NEXT:    [[R3:%.*]] = add i32 [[R2]], undef
+; CHECK-NEXT:    [[R4:%.*]] = add i32 [[R3]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP12]], [[ARG]]
+; CHECK-NEXT:    [[BIN_EXTRA3:%.*]] = add i32 [[BIN_EXTRA]], [[TMP9]]
+; CHECK-NEXT:    [[R5:%.*]] = add i32 [[R4]], undef
+; CHECK-NEXT:    ret i32 [[BIN_EXTRA3]]
 ;
 ; THRESHOLD-LABEL: @wobble(
 ; THRESHOLD-NEXT:  bb:
-; THRESHOLD-NEXT:    [[X1:%.*]] = xor i32 [[ARG:%.*]], [[BAR:%.*]]
-; THRESHOLD-NEXT:    [[I1:%.*]] = icmp eq i32 [[X1]], 0
-; THRESHOLD-NEXT:    [[S1:%.*]] = sext i1 [[I1]] to i32
-; THRESHOLD-NEXT:    [[X2:%.*]] = xor i32 [[ARG]], [[BAR]]
-; THRESHOLD-NEXT:    [[I2:%.*]] = icmp eq i32 [[X2]], 0
-; THRESHOLD-NEXT:    [[S2:%.*]] = sext i1 [[I2]] to i32
-; THRESHOLD-NEXT:    [[X3:%.*]] = xor i32 [[ARG]], [[BAR]]
-; THRESHOLD-NEXT:    [[I3:%.*]] = icmp eq i32 [[X3]], 0
-; THRESHOLD-NEXT:    [[S3:%.*]] = sext i1 [[I3]] to i32
-; THRESHOLD-NEXT:    [[X4:%.*]] = xor i32 [[ARG]], [[BAR]]
-; THRESHOLD-NEXT:    [[I4:%.*]] = icmp eq i32 [[X4]], 0
-; THRESHOLD-NEXT:    [[S4:%.*]] = sext i1 [[I4]] to i32
-; THRESHOLD-NEXT:    [[R1:%.*]] = add i32 [[ARG]], [[S1]]
-; THRESHOLD-NEXT:    [[R2:%.*]] = add i32 [[R1]], [[S2]]
-; THRESHOLD-NEXT:    [[R3:%.*]] = add i32 [[R2]], [[S3]]
-; THRESHOLD-NEXT:    [[R4:%.*]] = add i32 [[R3]], [[S4]]
-; THRESHOLD-NEXT:    [[R5:%.*]] = add i32 [[R4]], [[X4]]
-; THRESHOLD-NEXT:    ret i32 [[R5]]
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
+; THRESHOLD-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; THRESHOLD-NEXT:    [[R1:%.*]] = add i32 [[ARG]], undef
+; THRESHOLD-NEXT:    [[R2:%.*]] = add i32 [[R1]], undef
+; THRESHOLD-NEXT:    [[R3:%.*]] = add i32 [[R2]], undef
+; THRESHOLD-NEXT:    [[R4:%.*]] = add i32 [[R3]], undef
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP12]], [[ARG]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA3:%.*]] = add i32 [[BIN_EXTRA]], [[TMP9]]
+; THRESHOLD-NEXT:    [[R5:%.*]] = add i32 [[R4]], undef
+; THRESHOLD-NEXT:    ret i32 [[BIN_EXTRA3]]
 ;
 bb:
   %x1 = xor i32 %arg, %bar