diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -45,6 +45,7 @@ class TargetLibraryInfo; class TargetTransformInfo; class Value; +class WeakTrackingVH; /// A private "module" namespace for types and utilities used by this pass. /// These are implementation details and should not be used by clients. @@ -110,20 +111,25 @@ /// collected in GEPs. bool vectorizeGEPIndices(BasicBlock *BB, slpvectorizer::BoUpSLP &R); - /// Try to find horizontal reduction or otherwise vectorize a chain of binary - /// operators. + /// Try to find horizontal reduction or otherwise, collect instructions + /// for postponed vectorization attempts. + /// \a P if not null designates phi node the reduction is fed into + /// (with reduction operators \a V or one of its operands, in a basic block + /// \a BB). + /// \returns true if a horizontal reduction was matched and reduced. + /// \returns false if \a V is null or not an instruction, + /// or a horizontal reduction was not matched or not possible. + bool + vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, + slpvectorizer::BoUpSLP &R, TargetTransformInfo *TTI, + SmallVectorImpl &PostponedInsts); + + /// The convenience method version that also tries to vectorize postponed + /// binary operations. bool vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, slpvectorizer::BoUpSLP &R, TargetTransformInfo *TTI); - /// Try to vectorize trees that start at insertvalue instructions. - bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, - slpvectorizer::BoUpSLP &R); - - /// Try to vectorize trees that start at insertelement instructions. 
- bool vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, - slpvectorizer::BoUpSLP &R); - /// Tries to vectorize constructs started from CmpInst, InsertValueInst or /// InsertElementInst instructions. bool vectorizeSimpleInstructions(InstSetVector &Instructions, BasicBlock *BB, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11559,6 +11559,8 @@ SmallVectorImpl &BuildVectorOpds, SmallVectorImpl &InsertElts, unsigned OperandOffset) { + assert((isa(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); do { Value *InsertedOperand = LastInsertInst->getOperand(1); Optional OperandIndex = @@ -11597,13 +11599,10 @@ TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, SmallVectorImpl &InsertElts) { - - assert((isa(LastInsertInst) || - isa(LastInsertInst)) && - "Expected insertelement or insertvalue instruction!"); - assert((BuildVectorOpds.empty() && InsertElts.empty()) && "Expected empty result vectors!"); + if (!isa(LastInsertInst)) + return false; Optional AggregateSize = getAggregateSize(LastInsertInst); if (!AggregateSize) @@ -11689,28 +11688,19 @@ return false; } -/// Attempt to reduce a horizontal reduction. -/// If it is legal to match a horizontal reduction feeding the phi node \a P -/// with reduction operators \a Root (or one of its operands) in a basic block -/// \a BB, then check if it can be done. If horizontal reduction is not found -/// and root instruction is a binary operation, vectorization of the operands is -/// attempted. -/// \returns true if a horizontal reduction was matched and reduced or operands -/// of one of the binary instruction were vectorized. -/// \returns false if a horizontal reduction was not matched (or not possible) -/// or no vectorization of any binary operation feeding \a Root instruction was -/// performed. 
-static bool tryToVectorizeHorReductionOrInstOperands( - PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI, - const function_ref Vectorize) { +bool SLPVectorizerPass::vectorizeRootInstruction( + PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, + SmallVectorImpl &PostponedInsts) { if (!ShouldVectorizeHor) return false; + auto *Root = dyn_cast_or_null(V); if (!Root) return false; + if (!isa(Root)) + P = nullptr; + if (Root->getParent() != BB || isa(Root)) return false; // Start analysis starting from Root instruction. If horizontal reduction is @@ -11722,24 +11712,21 @@ // horizontal reduction. // Interrupt the process if the Root instruction itself was vectorized or all // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized. - // Skip the analysis of CmpInsts. Compiler implements postanalysis of the - // CmpInsts so we can skip extra attempts in - // tryToVectorizeHorReductionOrInstOperands and save compile time. + // If a horizontal reduction was not matched or vectorized we collect + // instructions for possible later attempts for vectorization. 
std::queue> Stack; Stack.emplace(Root, 0); SmallPtrSet VisitedInstrs; - SmallVector PostponedInsts; bool Res = false; - auto &&TryToReduce = [TTI, &SE, &DL, &P, &R, &TLI](Instruction *Inst, - Value *&B0, - Value *&B1) -> Value * { + auto &&TryToReduce = [this, TTI, &P, &R](Instruction *Inst, Value *&B0, + Value *&B1) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; bool IsBinop = matchRdxBop(Inst, B0, B1); bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value())); if (IsBinop || IsSelect) { HorizontalReduction HorRdx; - if (HorRdx.matchAssociativeReduction(P, Inst, SE, DL, TLI)) + if (HorRdx.matchAssociativeReduction(P, Inst, *SE, *DL, *TLI)) return HorRdx.tryToReduce(R, TTI); } return nullptr; @@ -11781,9 +11768,8 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; - // Do not try to vectorize CmpInst operands, this is done separately. - // Final attempt for binop args vectorization should happen after the loop - // to try to find reductions. + // Do not collect CmpInst or InsertElementInst/InsertValueInst as their + // analysis is done separately. if (!isa(Inst)) PostponedInsts.push_back(Inst); } @@ -11801,61 +11787,20 @@ !R.isDeleted(I) && I->getParent() == BB) Stack.emplace(I, Level); } - // Try to vectorized binops where reductions were not found. - for (Value *V : PostponedInsts) - if (auto *Inst = dyn_cast(V)) - if (!R.isDeleted(Inst)) - Res |= Vectorize(Inst, R); return Res; } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI) { - auto *I = dyn_cast_or_null(V); - if (!I) - return false; - - if (!isa(I)) - P = nullptr; - // Try to match and vectorize a horizontal reduction. 
- auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool { - return tryToVectorize(I, R); - }; - return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI, *SE, *DL, - *TLI, ExtraVectorization); -} - -bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, - BasicBlock *BB, BoUpSLP &R) { - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (!R.canMapToVector(IVI->getType(), DL)) - return false; - - SmallVector BuildVectorOpds; - SmallVector BuildVectorInsts; - if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts)) - return false; - - LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n"); - // Aggregate value is unlikely to be processed in vector register. - return tryToVectorizeList(BuildVectorOpds, R); -} - -bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, - BasicBlock *BB, BoUpSLP &R) { - SmallVector BuildVectorInsts; - SmallVector BuildVectorOpds; - SmallVector Mask; - if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) || - (llvm::all_of( - BuildVectorOpds, - [](Value *V) { return isa(V); }) && - isFixedVectorShuffle(BuildVectorOpds, Mask))) - return false; - - LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n"); - return tryToVectorizeList(BuildVectorInsts, R); + SmallVector PostponedInsts; + bool Res = vectorizeRootInstruction(P, V, BB, R, TTI, PostponedInsts); + // Try to vectorize binops where reductions were not found. 
+ for (Value *Op : PostponedInsts) + if (auto *Inst = dyn_cast(Op)) + if (!R.isDeleted(Inst)) + Res |= tryToVectorize(Inst, R); + return Res; } template @@ -11991,16 +11936,48 @@ for (auto *I : reverse(Instructions)) { if (R.isDeleted(I)) continue; - if (auto *LastInsertValue = dyn_cast(I)) { - OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R); - } else if (auto *LastInsertElem = dyn_cast(I)) { - OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R); - } else if (isa(I)) { + if (isa(I)) { PostponedCmps.push_back(I); continue; } - // Try to find reductions in buildvector sequnces. - OpsChanged |= vectorizeRootInstruction(nullptr, I, BB, R, TTI); + SmallVector BuildVectorOpds; + SmallVector BuildVectorInsts; + if (!findBuildAggregate(I, TTI, BuildVectorOpds, BuildVectorInsts)) + continue; + + // Try to find reductions in buildvector sequences. + SmallVector PostponedInsts; + for (Value *Op : BuildVectorOpds) + OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI, + PostponedInsts); + + if (isa(I)) { + const DataLayout &DL = BB->getModule()->getDataLayout(); + if (!R.canMapToVector(I->getType(), DL)) + continue; + + LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *I << "\n"); + // Aggregate value is unlikely to be processed in vector register. + OpsChanged |= tryToVectorizeList(BuildVectorOpds, R); + + } + else if (isa(I)) { + SmallVector Mask; + if (all_of(BuildVectorOpds, + [](Value *V) { + return isa(V); + }) && + isFixedVectorShuffle(BuildVectorOpds, Mask)) + continue; + + LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *I << "\n"); + OpsChanged |= tryToVectorizeList(BuildVectorInsts, R); + } + // Try to vectorize postponed binops where reductions were not found. + for (Value *V : PostponedInsts) + if (auto *Inst = dyn_cast(V)) + if (!R.isDeleted(Inst)) + OpsChanged |= tryToVectorize(Inst, R); } if (AtTerminator) { // Try to find reductions first. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,102 +10,23 @@ define void @test(double* nocapture readonly %arg, double* nocapture readonly %arg1, double* nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, double* [[ARG:%.*]], i64 1 -; CHECK-NEXT: [[LD1_0:%.*]] = load double, double* [[GEP1_0]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x double*> poison, double* [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x double*> [[TMP0]], <8 x double*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x double*> [[SHUFFLE]], <8 x i64> ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, double* [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 3 -; CHECK-NEXT: [[LD1_1:%.*]] = load double, double* [[GEP1_1]], align 8 -; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 1 -; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 17 -; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 5 -; CHECK-NEXT: [[LD1_2:%.*]] = load double, double* [[GEP1_2]], align 8 -; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 2 -; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 18 -; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 7 -; CHECK-NEXT: [[LD1_3:%.*]] = load double, double* [[GEP1_3]], align 8 -; CHECK-NEXT: [[GEP0_3:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 3 -; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr inbounds double, double* 
[[ARG1]], i64 19 -; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 9 -; CHECK-NEXT: [[LD1_4:%.*]] = load double, double* [[GEP1_4]], align 8 -; CHECK-NEXT: [[GEP0_4:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 4 -; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 20 -; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 11 -; CHECK-NEXT: [[LD1_5:%.*]] = load double, double* [[GEP1_5]], align 8 -; CHECK-NEXT: [[GEP0_5:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 5 -; CHECK-NEXT: [[GEP2_5:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 21 -; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 13 -; CHECK-NEXT: [[LD1_6:%.*]] = load double, double* [[GEP1_6]], align 8 -; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 6 -; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 22 -; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, double* [[ARG]], i64 15 -; CHECK-NEXT: [[LD1_7:%.*]] = load double, double* [[GEP1_7]], align 8 -; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 7 -; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, double* [[ARG1]], i64 23 -; CHECK-NEXT: [[LD0_0:%.*]] = load double, double* [[ARG1]], align 8 -; CHECK-NEXT: [[LD2_0:%.*]] = load double, double* [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD0_0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD2_0]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[LD1_0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[LD1_0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[LD0_1:%.*]] = load double, double* [[GEP0_1]], align 8 
-; CHECK-NEXT: [[LD2_1:%.*]] = load double, double* [[GEP2_1]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[LD0_1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[LD2_1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[LD1_1]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[LD1_1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[LD0_2:%.*]] = load double, double* [[GEP0_2]], align 8 -; CHECK-NEXT: [[LD2_2:%.*]] = load double, double* [[GEP2_2]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[LD0_2]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[LD2_2]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[LD1_2]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[LD1_2]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP15]] -; CHECK-NEXT: [[LD0_3:%.*]] = load double, double* [[GEP0_3]], align 8 -; CHECK-NEXT: [[LD2_3:%.*]] = load double, double* [[GEP2_3]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x double> poison, double [[LD0_3]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[TMP17]], double [[LD2_3]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[LD1_3]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[LD1_3]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <2 x double> [[TMP18]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x double> [[TMP16]], [[TMP21]] -; CHECK-NEXT: [[LD0_4:%.*]] = load double, double* [[GEP0_4]], align 8 -; CHECK-NEXT: [[LD2_4:%.*]] = 
load double, double* [[GEP2_4]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> poison, double [[LD0_4]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> [[TMP23]], double [[LD2_4]], i32 1 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[LD1_4]], i32 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[LD1_4]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <2 x double> [[TMP24]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x double> [[TMP22]], [[TMP27]] -; CHECK-NEXT: [[LD0_5:%.*]] = load double, double* [[GEP0_5]], align 8 -; CHECK-NEXT: [[LD2_5:%.*]] = load double, double* [[GEP2_5]], align 8 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> poison, double [[LD0_5]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[TMP29]], double [[LD2_5]], i32 1 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[LD1_5]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[TMP31]], double [[LD1_5]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = fmul fast <2 x double> [[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = fadd fast <2 x double> [[TMP28]], [[TMP33]] -; CHECK-NEXT: [[LD0_6:%.*]] = load double, double* [[GEP0_6]], align 8 -; CHECK-NEXT: [[LD2_6:%.*]] = load double, double* [[GEP2_6]], align 8 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <2 x double> poison, double [[LD0_6]], i32 0 -; CHECK-NEXT: [[TMP36:%.*]] = insertelement <2 x double> [[TMP35]], double [[LD2_6]], i32 1 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x double> poison, double [[LD1_6]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x double> [[TMP37]], double [[LD1_6]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = fmul fast <2 x double> [[TMP36]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = fadd fast <2 x double> [[TMP34]], [[TMP39]] -; CHECK-NEXT: [[LD0_7:%.*]] = load double, double* [[GEP0_7]], align 8 -; CHECK-NEXT: [[LD2_7:%.*]] = load double, 
double* [[GEP2_7]], align 8 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x double> poison, double [[LD0_7]], i32 0 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <2 x double> [[TMP41]], double [[LD2_7]], i32 1 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> poison, double [[LD1_7]], i32 0 -; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x double> [[TMP43]], double [[LD1_7]], i32 1 -; CHECK-NEXT: [[TMP45:%.*]] = fmul fast <2 x double> [[TMP42]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = fadd fast <2 x double> [[TMP40]], [[TMP45]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP1]], i32 8, <8 x i1> , <8 x double> undef) +; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[ARG1]] to <8 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, <8 x double>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[GEP2_0]] to <8 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x double>, <8 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <8 x double> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP9]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, double* [[ARG2:%.*]], <2 x i64> -; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> [[TMP46]], <2 x double*> [[P]], i32 8, <2 x i1> ) +; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> [[I143]], <2 x double*> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void ; entry: