Index: llvm/include/llvm/Passes/PassBuilder.h =================================================================== --- llvm/include/llvm/Passes/PassBuilder.h +++ llvm/include/llvm/Passes/PassBuilder.h @@ -710,8 +710,7 @@ void addRequiredLTOPreLinkPasses(ModulePassManager &MPM); - void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM, - bool IsLTO); + void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM); static Optional<std::vector<PipelineElement>> parsePipelineText(StringRef Text); Index: llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h =================================================================== --- llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -218,7 +218,7 @@ void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM); void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); - void addVectorPasses(legacy::PassManagerBase &PM, bool IsLTO); + void addVectorPasses(legacy::PassManagerBase &PM); public: /// populateFunctionPassManager - This fills in the function pass manager, Index: llvm/lib/Passes/PassBuilder.cpp =================================================================== --- llvm/lib/Passes/PassBuilder.cpp +++ llvm/lib/Passes/PassBuilder.cpp @@ -1201,34 +1201,30 @@ return MPM; } -/// FIXME: Should LTO cause any differences to this set of passes? void PassBuilder::addVectorPasses(OptimizationLevel Level, - FunctionPassManager &FPM, bool IsLTO) { + FunctionPassManager &FPM) { FPM.addPass(LoopVectorizePass( LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); - if (IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && PTO.LoopUnrolling) - FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); - FPM.addPass(LoopUnrollPass(LoopUnrollOptions( - Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); - FPM.addPass(WarnMissedTransformationsPass()); - } - - if (!IsLTO) { - // Eliminate loads by forwarding stores from the previous iteration to loads - // of the current iteration. - FPM.addPass(LoopLoadEliminationPass()); - } + // The vectorizer may have significantly shortened a loop body; unroll + // again. Unroll small loops to hide loop backedge latency and saturate any + // parallel execution resources of an out-of-order processor. We also then + // need to clean up redundancies and loop invariant code. + // FIXME: It would be really good to use a loop-integrated instruction + // combiner for cleanup here so that the unrolling and LICM can be pipelined + // across the loop nests.
+ // We do UnrollAndJam in a separate LPM to ensure it happens before unroll + if (EnableUnrollAndJam && PTO.LoopUnrolling) + FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); + FPM.addPass(LoopUnrollPass(LoopUnrollOptions( + Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll))); + FPM.addPass(WarnMissedTransformationsPass()); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + FPM.addPass(LoopLoadEliminationPass()); + // Cleanup after the loop optimization passes. FPM.addPass(InstCombinePass()); @@ -1270,11 +1266,6 @@ .needCanonicalLoops(false) .hoistCommonInsts(true) .sinkCommonInsts(true))); - if (IsLTO) { - FPM.addPass(SCCPPass()); - FPM.addPass(InstCombinePass()); - FPM.addPass(BDCEPass()); - } // Optimize parallel scalar instruction chains into SIMD instructions. if (PTO.SLPVectorization) { @@ -1286,31 +1277,7 @@ // Enhance/cleanup vector code. FPM.addPass(VectorCombinePass()); - if (IsLTO) { - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. - FPM.addPass(AlignmentFromAssumptionsPass()); - } - FPM.addPass(InstCombinePass()); - - if (!IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && PTO.LoopUnrolling) - FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); - FPM.addPass(LoopUnrollPass(LoopUnrollOptions( - Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); - FPM.addPass(WarnMissedTransformationsPass()); - FPM.addPass(InstCombinePass()); - } } ModulePassManager @@ -1407,7 +1374,7 @@ // from the TargetLibraryInfo. OptimizePM.addPass(InjectTLIMappings()); - addVectorPasses(Level, OptimizePM, /* IsLTO */ false); + addVectorPasses(Level, OptimizePM); OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); OptimizePM.addPass(createFunctionToLoopPassAdaptor( @@ -1865,7 +1832,7 @@ MainFPM.addPass(LoopDistributePass()); - addVectorPasses(Level, MainFPM, /* IsLTO */ true); + addVectorPasses(Level, MainFPM); invokePeepholeEPCallbacks(MainFPM, Level); MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -521,32 +521,27 @@ MPM.add(createControlHeightReductionLegacyPass()); } -/// FIXME: Should LTO cause any differences to this set of passes? -void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM, - bool IsLTO) { +void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM) { PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize)); - if (IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor.
We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && !DisableUnrollLoops) - PM.add(createLoopUnrollAndJamPass(OptLevel)); - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - PM.add(createWarnMissedTransformationsPass()); - } + // The vectorizer may have significantly shortened a loop body; unroll + // again. Unroll small loops to hide loop backedge latency and saturate any + // parallel execution resources of an out-of-order processor. We also then + // need to clean up redundancies and loop invariant code. + // FIXME: It would be really good to use a loop-integrated instruction + // combiner for cleanup here so that the unrolling and LICM can be pipelined + // across the loop nests. + // We do UnrollAndJam in a separate LPM to ensure it happens before unroll + if (EnableUnrollAndJam && !DisableUnrollLoops) + PM.add(createLoopUnrollAndJamPass(OptLevel)); + PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, + ForgetAllSCEVInLoopUnroll)); + PM.add(createWarnMissedTransformationsPass()); + + // Eliminate loads by forwarding stores from the previous iteration to loads + // of the current iteration. + PM.add(createLoopLoadEliminationPass()); - if (!IsLTO) { - // Eliminate loads by forwarding stores from the previous iteration to loads - // of the current iteration. - PM.add(createLoopLoadEliminationPass()); - } // Cleanup after the loop optimization passes. PM.add(createInstructionCombiningPass()); @@ -582,12 +577,6 @@ .hoistCommonInsts(true) .sinkCommonInsts(true))); - if (IsLTO) { - PM.add(createSCCPPass()); // Propagate exposed constants - PM.add(createInstructionCombiningPass()); // Clean up again - PM.add(createBitTrackingDCEPass()); - } - // Optimize parallel scalar instruction chains into SIMD instructions. if (SLPVectorize) { PM.add(createSLPVectorizerPass()); @@ -597,31 +586,8 @@ // Enhance/cleanup vector code. PM.add(createVectorCombinePass()); - - if (IsLTO) { - // After vectorization, assume intrinsics may tell us more about pointer - // alignments. - PM.add(createAlignmentFromAssumptionsPass()); - } addExtensionsToPM(EP_Peephole, PM); PM.add(createInstructionCombiningPass()); - - if (!IsLTO) { - // The vectorizer may have significantly shortened a loop body; unroll - // again. Unroll small loops to hide loop backedge latency and saturate any - // parallel execution resources of an out-of-order processor. We also then - // need to clean up redundancies and loop invariant code. - // FIXME: It would be really good to use a loop-integrated instruction - // combiner for cleanup here so that the unrolling and LICM can be pipelined - // across the loop nests. - // We do UnrollAndJam in a separate LPM to ensure it happens before unroll - if (EnableUnrollAndJam && !DisableUnrollLoops) - PM.add(createLoopUnrollAndJamPass(OptLevel)); - PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops, - ForgetAllSCEVInLoopUnroll)); - if (!DisableUnrollLoops) - PM.add(createInstructionCombiningPass()); - } } void PassManagerBuilder::populateModulePassManager( @@ -895,7 +861,7 @@ // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 
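The cross-iteration forwarding described in the LoopLoadElimination comment above is easiest to see on a small loop. A minimal sketch (illustrative source, not part of this patch): iteration i stores A[i+1], and iteration i+1 loads that same location back as A[i], so the pass can carry the stored value through a PHI instead of reloading it.

    // Hypothetical input loop for LoopLoadEliminationPass: the load of A[i]
    // reads the value that the previous iteration stored to A[(i-1)+1].
    void f(int *A, const int *B, int N) {
      for (int i = 0; i < N - 1; ++i)
        A[i + 1] = A[i] + B[i];
    }

After the transform, only the first iteration performs a real load of A[0]; later iterations reuse the value stored one iteration earlier, removing a load from the loop-carried critical path.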
MPM.add(createLoopDistributePass()); - addVectorPasses(MPM, /* IsLTO */ false); + addVectorPasses(MPM); if (!DisableUnrollLoops) { // Runtime unrolling will introduce runtime check in loop prologue. If the @@ -1120,7 +1086,7 @@ ForgetAllSCEVInLoopUnroll)); PM.add(createLoopDistributePass()); - addVectorPasses(PM, /* IsLTO */ true); + addVectorPasses(PM); addExtensionsToPM(EP_Peephole, PM); Index: llvm/test/CodeGen/AMDGPU/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/opt-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/opt-pipeline.ll @@ -234,11 +234,17 @@ ; GCN-O1-NEXT: Inject TLI Mappings ; GCN-O1-NEXT: Loop Vectorization ; GCN-O1-NEXT: Canonicalize natural loops -; GCN-O1-NEXT: Scalar Evolution Analysis +; GCN-O1-NEXT: LCSSA Verifier +; GCN-O1-NEXT: Loop-Closed SSA Form Pass ; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Loop Access Analysis +; GCN-O1-NEXT: Scalar Evolution Analysis +; GCN-O1-NEXT: Loop Pass Manager +; GCN-O1-NEXT: Unroll loops ; GCN-O1-NEXT: Lazy Branch Probability Analysis ; GCN-O1-NEXT: Lazy Block Frequency Analysis +; GCN-O1-NEXT: Optimization Remark Emitter +; GCN-O1-NEXT: Warn about non-applied transformations +; GCN-O1-NEXT: Loop Access Analysis ; GCN-O1-NEXT: Loop Load Elimination ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O1-NEXT: Function Alias Analysis Results @@ -256,16 +262,6 @@ ; GCN-O1-NEXT: Lazy Block Frequency Analysis ; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Combine redundant instructions -; GCN-O1-NEXT: Canonicalize natural loops -; GCN-O1-NEXT: LCSSA Verifier -; GCN-O1-NEXT: Loop-Closed SSA Form Pass -; GCN-O1-NEXT: Scalar Evolution Analysis -; GCN-O1-NEXT: Loop Pass Manager -; GCN-O1-NEXT: Unroll loops -; GCN-O1-NEXT: Lazy Branch Probability Analysis -; GCN-O1-NEXT: Lazy Block Frequency Analysis -; GCN-O1-NEXT: Optimization Remark Emitter -; GCN-O1-NEXT: Combine redundant instructions ; GCN-O1-NEXT: Memory SSA ; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: LCSSA Verifier @@ -581,11 +577,17 @@ ; GCN-O2-NEXT: Inject TLI Mappings ; GCN-O2-NEXT: Loop Vectorization ; GCN-O2-NEXT: Canonicalize natural loops -; GCN-O2-NEXT: Scalar Evolution Analysis +; GCN-O2-NEXT: LCSSA Verifier +; GCN-O2-NEXT: Loop-Closed SSA Form Pass ; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Loop Access Analysis +; GCN-O2-NEXT: Scalar Evolution Analysis +; GCN-O2-NEXT: Loop Pass Manager +; GCN-O2-NEXT: Unroll loops ; GCN-O2-NEXT: Lazy Branch Probability Analysis ; GCN-O2-NEXT: Lazy Block Frequency Analysis +; GCN-O2-NEXT: Optimization Remark Emitter +; GCN-O2-NEXT: Warn about non-applied transformations +; GCN-O2-NEXT: Loop Access Analysis ; GCN-O2-NEXT: Loop Load Elimination ; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O2-NEXT: Function Alias Analysis Results @@ -608,16 +610,6 @@ ; GCN-O2-NEXT: Optimize scalar/vector ops ; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Combine redundant instructions -; GCN-O2-NEXT: Canonicalize natural loops -; GCN-O2-NEXT: LCSSA Verifier -; GCN-O2-NEXT: Loop-Closed SSA Form Pass -; GCN-O2-NEXT: Scalar Evolution Analysis -; GCN-O2-NEXT: Loop Pass Manager -; GCN-O2-NEXT: Unroll loops -; GCN-O2-NEXT: Lazy Branch Probability Analysis -; GCN-O2-NEXT: Lazy Block Frequency Analysis -; GCN-O2-NEXT: Optimization Remark Emitter -; GCN-O2-NEXT: Combine redundant instructions ; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: Canonicalize natural loops ; GCN-O2-NEXT: LCSSA Verifier @@ -940,11 +932,17 @@ ; 
GCN-O3-NEXT: Inject TLI Mappings ; GCN-O3-NEXT: Loop Vectorization ; GCN-O3-NEXT: Canonicalize natural loops -; GCN-O3-NEXT: Scalar Evolution Analysis +; GCN-O3-NEXT: LCSSA Verifier +; GCN-O3-NEXT: Loop-Closed SSA Form Pass ; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Loop Access Analysis +; GCN-O3-NEXT: Scalar Evolution Analysis +; GCN-O3-NEXT: Loop Pass Manager +; GCN-O3-NEXT: Unroll loops ; GCN-O3-NEXT: Lazy Branch Probability Analysis ; GCN-O3-NEXT: Lazy Block Frequency Analysis +; GCN-O3-NEXT: Optimization Remark Emitter +; GCN-O3-NEXT: Warn about non-applied transformations +; GCN-O3-NEXT: Loop Access Analysis ; GCN-O3-NEXT: Loop Load Elimination ; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) ; GCN-O3-NEXT: Function Alias Analysis Results @@ -967,16 +965,6 @@ ; GCN-O3-NEXT: Optimize scalar/vector ops ; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Combine redundant instructions -; GCN-O3-NEXT: Canonicalize natural loops -; GCN-O3-NEXT: LCSSA Verifier -; GCN-O3-NEXT: Loop-Closed SSA Form Pass -; GCN-O3-NEXT: Scalar Evolution Analysis -; GCN-O3-NEXT: Loop Pass Manager -; GCN-O3-NEXT: Unroll loops -; GCN-O3-NEXT: Lazy Branch Probability Analysis -; GCN-O3-NEXT: Lazy Block Frequency Analysis -; GCN-O3-NEXT: Optimization Remark Emitter -; GCN-O3-NEXT: Combine redundant instructions ; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: Canonicalize natural loops ; GCN-O3-NEXT: LCSSA Verifier Index: llvm/test/Other/new-pm-defaults.ll =================================================================== --- llvm/test/Other/new-pm-defaults.ll +++ llvm/test/Other/new-pm-defaults.ll @@ -234,6 +234,8 @@ ; CHECK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis +; CHECK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -243,9 +245,6 @@ ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass -; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Starting llvm::Function pass manager run. 
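The check lines in these pipeline tests trace the default new-PM function pipeline, in which LoopUnrollPass and WarnMissedTransformationsPass now run before LoopLoadEliminationPass. A minimal sketch (assumed driver boilerplate, not from this patch; the exact spelling of OptimizationLevel varies by LLVM revision) of building the pipeline that produces such a trace:

    // Build the O2 module pipeline whose pass sequence the tests check.
    #include "llvm/Passes/PassBuilder.h"
    using namespace llvm;

    ModulePassManager buildO2(PassBuilder &PB) {
      // addVectorPasses(Level, FPM) is invoked once while this pipeline is
      // assembled; actually running the result still requires registering
      // the usual analysis managers and pass instrumentation.
      return PB.buildPerModuleDefaultPipeline(PassBuilder::OptimizationLevel::O2);
    }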
; CHECK-O-NEXT: Running pass: LoopSimplifyPass Index: llvm/test/Other/new-pm-lto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-lto-defaults.ll +++ llvm/test/Other/new-pm-lto-defaults.ll @@ -120,16 +120,14 @@ ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo +; CHECK-O23SZ-NEXT: Running pass: LoopLoadEliminationPass on foo +; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on Loop ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo ; CHECK-O23SZ-NEXT: Running pass: SimplifyCFGPass on foo -; CHECK-O23SZ-NEXT: Running pass: SCCPPass on foo -; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo -; CHECK-O23SZ-NEXT: Running pass: BDCEPass on foo ; CHECK-O2-NEXT: Running pass: SLPVectorizerPass on foo ; CHECK-O3-NEXT: Running pass: SLPVectorizerPass on foo ; CHECK-OS-NEXT: Running pass: SLPVectorizerPass on foo ; CHECK-O23SZ-NEXT: Running pass: VectorCombinePass on foo -; CHECK-O23SZ-NEXT: Running pass: AlignmentFromAssumptionsPass on foo ; CHECK-O23SZ-NEXT: Running pass: InstCombinePass on foo ; CHECK-EP-Peephole-NEXT: Running pass: NoOpFunctionPass on foo ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass on foo Index: llvm/test/Other/new-pm-thinlto-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-defaults.ll +++ llvm/test/Other/new-pm-thinlto-defaults.ll @@ -221,6 +221,8 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass ; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis ; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis +; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass @@ -230,9 +232,6 @@ ; CHECK-POSTLINK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-POSTLINK-O-NEXT: Running pass: VectorCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass -; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-POSTLINK-O-NEXT: Running pass: WarnMissedTransformationsPass -; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-POSTLINK-O-NEXT: Starting llvm::Function pass manager run ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSimplifyPass Index: llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -190,6 +190,8 @@ ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -199,9 +201,6 @@ ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-O-NEXT: 
Running pass: WarnMissedTransformationsPass -; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Starting {{.*}}Function pass manager run ; CHECK-O-NEXT: Running pass: LoopSimplifyPass Index: llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll =================================================================== --- llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -202,6 +202,8 @@ ; CHECK-O-NEXT: Running pass: LoopDistributePass ; CHECK-O-NEXT: Running pass: InjectTLIMappings ; CHECK-O-NEXT: Running pass: LoopVectorizePass +; CHECK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass @@ -211,9 +213,6 @@ ; CHECK-Os-NEXT: Running pass: SLPVectorizerPass ; CHECK-O-NEXT: Running pass: VectorCombinePass ; CHECK-O-NEXT: Running pass: InstCombinePass -; CHECK-O-NEXT: Running pass: LoopUnrollPass -; CHECK-O-NEXT: Running pass: WarnMissedTransformationsPass -; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Starting {{.*}}Function pass manager run ; CHECK-O-NEXT: Running pass: LoopSimplifyPass Index: llvm/test/Other/opt-LTO-pipeline.ll =================================================================== --- llvm/test/Other/opt-LTO-pipeline.ll +++ llvm/test/Other/opt-LTO-pipeline.ll @@ -155,23 +155,22 @@ ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Warn about non-applied transformations -; CHECK-NEXT: Combine redundant instructions -; CHECK-NEXT: Simplify the CFG -; CHECK-NEXT: Sparse Conditional Constant Propagation -; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions -; CHECK-NEXT: Demanded bits analysis -; CHECK-NEXT: Bit-Tracking Dead Code Elimination +; CHECK-NEXT: Simplify the CFG +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Optimize scalar/vector ops -; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Alignment from assumptions +; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Lazy Value Information Analysis Index: llvm/test/Other/opt-O2-pipeline.ll =================================================================== --- llvm/test/Other/opt-O2-pipeline.ll +++ llvm/test/Other/opt-O2-pipeline.ll @@ -228,11 +228,17 @@ ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Function Alias Analysis Results -; 
CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Unroll loops ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Warn about non-applied transformations +; CHECK-NEXT: Loop Access Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results @@ -255,16 +261,6 @@ ; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions -; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: LCSSA Verifier -; CHECK-NEXT: Loop-Closed SSA Form Pass -; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Unroll loops -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter -; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier Index: llvm/test/Other/opt-O3-pipeline-enable-matrix.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -240,11 +240,17 @@ ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Unroll loops ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Warn about non-applied transformations +; CHECK-NEXT: Loop Access Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results @@ -267,16 +273,6 @@ ; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions -; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: LCSSA Verifier -; CHECK-NEXT: Loop-Closed SSA Form Pass -; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Unroll loops -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter -; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier Index: llvm/test/Other/opt-O3-pipeline.ll =================================================================== --- llvm/test/Other/opt-O3-pipeline.ll +++ llvm/test/Other/opt-O3-pipeline.ll @@ -234,11 +234,17 @@ ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Unroll loops ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; 
CHECK-NEXT: Warn about non-applied transformations +; CHECK-NEXT: Loop Access Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results @@ -261,16 +267,6 @@ ; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions -; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: LCSSA Verifier -; CHECK-NEXT: Loop-Closed SSA Form Pass -; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Unroll loops -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter -; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier @@ -468,15 +464,14 @@ ; NEWPM-NEXT: LoopVectorizePass on f ; NEWPM-NEXT: BlockFrequencyAnalysis analysis on f ; NEWPM-NEXT: BranchProbabilityAnalysis analysis on f +; NEWPM-NEXT: LoopUnrollPass on f +; NEWPM-NEXT: WarnMissedTransformationsPass on f ; NEWPM-NEXT: LoopLoadEliminationPass on f ; NEWPM-NEXT: InstCombinePass on f ; NEWPM-NEXT: SimplifyCFGPass on f ; NEWPM-NEXT: SLPVectorizerPass on f ; NEWPM-NEXT: VectorCombinePass on f ; NEWPM-NEXT: InstCombinePass on f -; NEWPM-NEXT: LoopUnrollPass on f -; NEWPM-NEXT: WarnMissedTransformationsPass on f -; NEWPM-NEXT: InstCombinePass on f ; NEWPM-NEXT: RequireAnalysisPass<{{.*}}> on f ; NEWPM-NEXT: FunctionToLoopPassAdaptor on f ; NEWPM-NEXT: PassManager<{{.*}}> on f Index: llvm/test/Other/opt-Os-pipeline.ll =================================================================== --- llvm/test/Other/opt-Os-pipeline.ll +++ llvm/test/Other/opt-Os-pipeline.ll @@ -214,11 +214,17 @@ ; CHECK-NEXT: Inject TLI Mappings ; CHECK-NEXT: Loop Vectorization ; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: LCSSA Verifier +; CHECK-NEXT: Loop-Closed SSA Form Pass ; CHECK-NEXT: Function Alias Analysis Results -; CHECK-NEXT: Loop Access Analysis +; CHECK-NEXT: Scalar Evolution Analysis +; CHECK-NEXT: Loop Pass Manager +; CHECK-NEXT: Unroll loops ; CHECK-NEXT: Lazy Branch Probability Analysis ; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter +; CHECK-NEXT: Warn about non-applied transformations +; CHECK-NEXT: Loop Access Analysis ; CHECK-NEXT: Loop Load Elimination ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results @@ -241,16 +247,6 @@ ; CHECK-NEXT: Optimize scalar/vector ops ; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Combine redundant instructions -; CHECK-NEXT: Canonicalize natural loops -; CHECK-NEXT: LCSSA Verifier -; CHECK-NEXT: Loop-Closed SSA Form Pass -; CHECK-NEXT: Scalar Evolution Analysis -; CHECK-NEXT: Loop Pass Manager -; CHECK-NEXT: Unroll loops -; CHECK-NEXT: Lazy Branch Probability Analysis -; CHECK-NEXT: Lazy Block Frequency Analysis -; CHECK-NEXT: Optimization Remark Emitter -; CHECK-NEXT: Combine redundant instructions ; CHECK-NEXT: Memory SSA ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: LCSSA Verifier Index: llvm/test/Other/pass-pipelines.ll =================================================================== --- llvm/test/Other/pass-pipelines.ll +++ llvm/test/Other/pass-pipelines.ll @@ -88,13 +88,12 @@ ; CHECK-O2-NEXT: Rotate Loops ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Vectorization -; CHECK-O2-NOT: Manager -; CHECK-O2: SLP Vectorizer -; 
CHECK-O2-NOT: Manager -; After vectorization we do partial unrolling. +; After loop vectorization we do partial unrolling. ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NEXT: Unroll loops ; CHECK-O2-NOT: Manager +; CHECK-O2: SLP Vectorizer +; CHECK-O2-NOT: Manager ; After vectorization and unrolling we try to do any cleanup of inserted code, ; including a run of LICM. This shouldn't run in the same loop pass manager as ; the runtime unrolling though. Index: llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -28,119 +28,29 @@ ; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[CAST_CRD]], 5.000000e-01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 -; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -32 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 5 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96 -; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 1152921504606846972 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], <8 x float>* [[TMP2]], align 4 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP4]], align 4 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 16 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], <8 x float>* [[TMP6]], align 4 -; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 8 +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], <8 x float>* [[TMP6]], align 4 +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 24 ; AUTO_VEC-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], <8 x float>* [[TMP8]], align 4 -; 
AUTO_VEC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 16 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], <8 x float>* [[TMP10]], align 4 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP5]], i64 24 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], <8 x float>* [[TMP12]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT]], <8 x float>* [[TMP14]], align 4 -; AUTO_VEC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 8 -; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_1]], <8 x float>* [[TMP16]], align 4 -; AUTO_VEC-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 16 -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_1]], <8 x float>* [[TMP18]], align 4 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP13]], i64 24 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_1]], <8 x float>* [[TMP20]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT_1]], <8 x float>* [[TMP22]], align 4 -; AUTO_VEC-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 8 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_2]], <8 x float>* [[TMP24]], align 4 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 16 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_2]], <8 x float>* [[TMP26]], align 4 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i64 24 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_2]], <8 x float>* [[TMP28]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 96 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_2:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_3:%.*]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_3:%.*]] = fadd 
fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_NEXT_2]], <8 x float>* [[TMP30]], align 4 -; AUTO_VEC-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 8 -; AUTO_VEC-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_3]], <8 x float>* [[TMP32]], align 4 -; AUTO_VEC-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 16 -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_3]], <8 x float>* [[TMP34]], align 4 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP29]], i64 24 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3_3]], <8 x float>* [[TMP36]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_3]] = add i64 [[INDEX]], 128 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_3]] = fadd fast <8 x float> [[VEC_IND]], -; AUTO_VEC-NEXT: [[NITER_NSUB_3]] = add i64 [[NITER]], -4 -; AUTO_VEC-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0 -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[VEC_IND_EPIL:%.*]] = phi <8 x float> [ [[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX_EPIL]] -; AUTO_VEC-NEXT: [[TMP38:%.*]] = bitcast float* [[TMP37]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND_EPIL]], <8 x float>* [[TMP38]], align 4 -; AUTO_VEC-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 8 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = bitcast float* [[TMP39]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD_EPIL]], <8 x float>* [[TMP40]], align 4 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 16 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = bitcast float* [[TMP41]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2_EPIL]], <8 x float>* [[TMP42]], align 4 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP37]], i64 24 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> 
[[STEP_ADD3_EPIL]], <8 x float>* [[TMP44]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_EPIL]] = fadd fast <8 x float> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP2:![0-9]+]] +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], <8 x float>* [[TMP8]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <8 x float> [[VEC_IND]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[ZEXT]] ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY]] @@ -151,8 +61,8 @@ ; AUTO_VEC-NEXT: store float [[X_06]], float* [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[CONV1]] = fadd fast float [[X_06]], 5.000000e-01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; AUTO_VEC-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] -; AUTO_VEC-NEXT: br i1 [[TMP45]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AUTO_VEC-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[ZEXT]] +; AUTO_VEC-NEXT: br i1 [[TMP10]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -259,7 +169,7 @@ ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 ; AUTO_VEC-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP6:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP4:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: ret void ; @@ -298,124 +208,34 @@ ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 ; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to double ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast double [[CAST_CRD]], 3.000000e+00 -; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[N_VEC]], -16 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 4 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 48 -; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 2305843009213693948 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_3:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ 
[[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>* +; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], <4 x double>* [[TMP2]], align 8 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, double* [[TMP1]], i64 4 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>* +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP4]], align 8 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[TMP1]], i64 8 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], <4 x double>* [[TMP6]], align 8 -; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[TMP5]], i64 4 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], <4 x double>* [[TMP6]], align 8 +; AUTO_VEC-NEXT: [[TMP7:%.*]] = getelementptr double, double* [[TMP1]], i64 12 ; AUTO_VEC-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], <4 x double>* [[TMP8]], align 8 -; AUTO_VEC-NEXT: [[TMP9:%.*]] = getelementptr double, double* [[TMP5]], i64 8 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], <4 x double>* [[TMP10]], align 8 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = getelementptr double, double* [[TMP5]], i64 12 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], <4 x double>* [[TMP12]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT]], <4 x double>* [[TMP14]], align 8 -; AUTO_VEC-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[TMP13]], i64 4 -; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP15]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_1]], <4 x double>* [[TMP16]], align 8 -; AUTO_VEC-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[TMP13]], i64 8 -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_1]], <4 x double>* [[TMP18]], align 8 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP13]], i64 12 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_1]], <4 x double>* [[TMP20]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_2:%.*]] 
= fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP22:%.*]] = bitcast double* [[TMP21]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT_1]], <4 x double>* [[TMP22]], align 8 -; AUTO_VEC-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[TMP21]], i64 4 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_2]], <4 x double>* [[TMP24]], align 8 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP21]], i64 8 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_2]], <4 x double>* [[TMP26]], align 8 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP21]], i64 12 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = bitcast double* [[TMP27]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_2]], <4 x double>* [[TMP28]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_2:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD2_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[STEP_ADD3_3:%.*]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[TMP29:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_NEXT_2]], <4 x double>* [[TMP30]], align 8 -; AUTO_VEC-NEXT: [[TMP31:%.*]] = getelementptr double, double* [[TMP29]], i64 4 -; AUTO_VEC-NEXT: [[TMP32:%.*]] = bitcast double* [[TMP31]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_3]], <4 x double>* [[TMP32]], align 8 -; AUTO_VEC-NEXT: [[TMP33:%.*]] = getelementptr double, double* [[TMP29]], i64 8 -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast double* [[TMP33]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_3]], <4 x double>* [[TMP34]], align 8 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP29]], i64 12 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_3]], <4 x double>* [[TMP36]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_3]] = add i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_3]] = fadd fast <4 x double> [[VEC_IND]], -; AUTO_VEC-NEXT: [[NITER_NSUB_3]] = add i64 [[NITER]], -4 -; AUTO_VEC-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0 -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <4 x double> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_3]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[VEC_IND_EPIL:%.*]] = phi <4 x double> [ 
[[VEC_IND_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[VEC_IND_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[A]], i64 [[INDEX_EPIL]] -; AUTO_VEC-NEXT: [[TMP38:%.*]] = bitcast double* [[TMP37]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND_EPIL]], <4 x double>* [[TMP38]], align 8 -; AUTO_VEC-NEXT: [[TMP39:%.*]] = getelementptr double, double* [[TMP37]], i64 4 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = bitcast double* [[TMP39]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD_EPIL]], <4 x double>* [[TMP40]], align 8 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP37]], i64 8 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2_EPIL]], <4 x double>* [[TMP42]], align 8 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP37]], i64 12 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = bitcast double* [[TMP43]] to <4 x double>* -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3_EPIL]], <4 x double>* [[TMP44]], align 8 -; AUTO_VEC-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 16 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_EPIL]] = fadd fast <4 x double> [[VEC_IND_EPIL]], -; AUTO_VEC-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], <4 x double>* [[TMP8]], align 8 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], +; AUTO_VEC-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] -; AUTO_VEC-NEXT: [[TMP45:%.*]] = add nsw i64 [[N_VEC]], -1 -; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP45]] to double -; AUTO_VEC-NEXT: [[TMP46:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = add nsw i64 [[N_VEC]], -1 +; AUTO_VEC-NEXT: [[CAST_CMO:%.*]] = sitofp i64 [[TMP10]] to double +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul fast double [[CAST_CMO]], 3.000000e+00 ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AUTO_VEC: for.body: ; AUTO_VEC-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] @@ -425,9 +245,9 @@ ; AUTO_VEC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; AUTO_VEC-NEXT: [[J_NEXT]] = fadd fast double [[J]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[SMAX]] -; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AUTO_VEC: for.end: -; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP46]], 
[[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[J]], [[FOR_BODY]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] ; entry: @@ -514,7 +334,7 @@ ; AUTO_VEC-NEXT: [[J_NEXT_EPIL]] = fadd double [[J_EPIL]], 3.000000e+00 ; AUTO_VEC-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 ; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP10:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP8:![0-9]+]] ; AUTO_VEC: for.end: ; AUTO_VEC-NEXT: [[J_LCSSA:%.*]] = phi double [ [[J_LCSSA_PH]], [[FOR_END_UNR_LCSSA]] ], [ [[J_EPIL]], [[FOR_BODY_EPIL]] ] ; AUTO_VEC-NEXT: ret double [[J_LCSSA]] @@ -559,114 +379,41 @@ ; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[CAST_CRD]], 4.200000e+01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = add nsw i64 [[N_VEC]], -32 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = lshr exact i64 [[TMP2]], 5 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1 -; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP2]], 0 -; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] -; AUTO_VEC: vector.ph.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 1152921504606846974 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: -; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], -; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 8 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP5]], align 4 +; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 16 ; AUTO_VEC-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 -; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 8 +; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 +; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, 
float* [[TMP2]], i64 24 ; AUTO_VEC-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 -; AUTO_VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 16 -; AUTO_VEC-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 -; AUTO_VEC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 24 -; AUTO_VEC-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 -; AUTO_VEC-NEXT: [[TMP14:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] -; AUTO_VEC-NEXT: [[TMP15:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] -; AUTO_VEC-NEXT: [[TMP16:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] -; AUTO_VEC-NEXT: [[TMP17:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] -; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP14]], <8 x float>* [[TMP18]], align 4 -; AUTO_VEC-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP8]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP15]], <8 x float>* [[TMP19]], align 4 -; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP10]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP16]], <8 x float>* [[TMP20]], align 4 -; AUTO_VEC-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP12]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP21]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], -; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], -; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], -; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], -; AUTO_VEC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_NEXT]] -; AUTO_VEC-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 -; AUTO_VEC-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 8 -; AUTO_VEC-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5_1:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 -; AUTO_VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 16 -; AUTO_VEC-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6_1:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 -; AUTO_VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 24 -; AUTO_VEC-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7_1:%.*]] = load <8 x float>, <8 x float>* [[TMP29]], align 4 -; AUTO_VEC-NEXT: [[TMP30:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], [[WIDE_LOAD_1]] -; AUTO_VEC-NEXT: [[TMP31:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], [[WIDE_LOAD5_1]] -; AUTO_VEC-NEXT: [[TMP32:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], [[WIDE_LOAD6_1]] -; AUTO_VEC-NEXT: [[TMP33:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], [[WIDE_LOAD7_1]] -; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP22]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> 
[[TMP30]], <8 x float>* [[TMP34]], align 4 -; AUTO_VEC-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP31]], <8 x float>* [[TMP35]], align 4 -; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP32]], <8 x float>* [[TMP36]], align 4 -; AUTO_VEC-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP33]], <8 x float>* [[TMP37]], align 4 -; AUTO_VEC-NEXT: [[INDEX_NEXT_1]] = add i64 [[INDEX]], 64 -; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], -; AUTO_VEC-NEXT: [[NITER_NSUB_1]] = add i64 [[NITER]], -2 -; AUTO_VEC-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NSUB_1]], 0 -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] -; AUTO_VEC: middle.block.unr-lcssa: -; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_1]], [[VECTOR_BODY]] ] -; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] -; AUTO_VEC: vector.body.epil: -; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], -; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], -; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], -; AUTO_VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_UNR]] -; AUTO_VEC-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP39]], align 4 -; AUTO_VEC-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 8 -; AUTO_VEC-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD5_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP41]], align 4 -; AUTO_VEC-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 16 -; AUTO_VEC-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD6_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP43]], align 4 -; AUTO_VEC-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 24 -; AUTO_VEC-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AUTO_VEC-NEXT: [[WIDE_LOAD7_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP45]], align 4 -; AUTO_VEC-NEXT: [[TMP46:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], [[WIDE_LOAD_EPIL]] -; AUTO_VEC-NEXT: [[TMP47:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], [[WIDE_LOAD5_EPIL]] -; AUTO_VEC-NEXT: [[TMP48:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], [[WIDE_LOAD6_EPIL]] -; AUTO_VEC-NEXT: [[TMP49:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_EPIL]], [[WIDE_LOAD7_EPIL]] -; AUTO_VEC-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP38]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP46]], <8 x float>* [[TMP50]], align 4 -; AUTO_VEC-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP40]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP47]], <8 x float>* [[TMP51]], align 4 -; AUTO_VEC-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP42]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP48]], <8 x float>* [[TMP52]], align 4 -; AUTO_VEC-NEXT: [[TMP53:%.*]] = 
bitcast float* [[TMP44]] to <8 x float>* -; AUTO_VEC-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP53]], align 4 -; AUTO_VEC-NEXT: br label [[MIDDLE_BLOCK]] +; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] +; AUTO_VEC-NEXT: [[TMP11:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] +; AUTO_VEC-NEXT: [[TMP12:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] +; AUTO_VEC-NEXT: [[TMP13:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] +; AUTO_VEC-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP2]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP10]], <8 x float>* [[TMP14]], align 4 +; AUTO_VEC-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP4]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP11]], <8 x float>* [[TMP15]], align 4 +; AUTO_VEC-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP12]], <8 x float>* [[TMP16]], align 4 +; AUTO_VEC-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP13]], <8 x float>* [[TMP17]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd reassoc <8 x float> [[STEP_ADD3]], +; AUTO_VEC-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AUTO_VEC-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; AUTO_VEC: middle.block: ; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] ; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -676,13 +423,13 @@ ; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV]] -; AUTO_VEC-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP54]] +; AUTO_VEC-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP19]] ; AUTO_VEC-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 ; AUTO_VEC-NEXT: [[ADD3]] = fadd reassoc float [[X_012]], 4.200000e+01 ; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AUTO_VEC-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] -; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; entry: %cmp.not11 = icmp eq i32 %N, 0 Index: llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -231,134 +231,32 @@ define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> +; AVX512-NEXT: br label 
[[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> [[VEC_IND_NEXT]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: 
call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; 
AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; 
AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: 
[[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX6]], 32 +; AVX512-NEXT: [[VEC_IND_NEXT_1]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 256 +; AVX512-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; AVX512: for.end: ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2( @@ -547,134 +445,32 @@ define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) { ; AVX512-LABEL: @foo3( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], 
%struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> [[VEC_IND_NEXT]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> [[VEC_IND_NEXT]], i32 1 ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> 
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], 
%struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = 
call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, 
<16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1 -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 32 +; AVX512-NEXT: [[VEC_IND_NEXT_1]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 256 +; AVX512-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX512: for.end: ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo3( @@ -849,134 +645,32 @@ define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 { ; AVX512-LABEL: @foo2_addrspace( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], 
[[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> [[VEC_IND_NEXT]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: 
[[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; 
AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x 
float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = 
call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
-; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]],
-; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX6]], 32
+; AVX512-NEXT: [[VEC_IND_NEXT_1]] = add <16 x i64> [[VEC_IND]],
+; AVX512-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 256
+; AVX512-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace(
@@ -1151,134 +845,32 @@
define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
; AVX512-LABEL: @foo2_addrspace2(
; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64>
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef)
; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]],
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64>
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
-; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <16 x i64> [[VEC_IND]],
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> [[VEC_IND_NEXT]]
; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef)
; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
-; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> [[VEC_IND_NEXT]], i32 1
; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x
float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> 
@llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; 
AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) -; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], -; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer -; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) -; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer -; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) -; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) -; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer -; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 
x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) -; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], -; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) -; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer -; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) -; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], -; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) -; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer -; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) -; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], -; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) -; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer -; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) -; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], -; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX6]], 32 +; AVX512-NEXT: [[VEC_IND_NEXT_1]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 256 +; AVX512-NEXT: br i1 
[[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; AVX512: for.end: ; AVX512-NEXT: ret void ; ; FVW2-LABEL: @foo2_addrspace2( @@ -1453,134 +1045,32 @@ define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { ; AVX512-LABEL: @foo2_addrspace3( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer -; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) ; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <16 x i64> [[VEC_IND]], +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) ; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> [[VEC_IND_NEXT]], i32 1 ; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) ; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], -; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> [[VEC_IND_NEXT]] ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x 
i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) -; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) -; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) -; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], -; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer -; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) -; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) -; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer -; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) -; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], -; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float 
addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer -; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) -; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], -; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) -; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], -; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) -; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> -; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) -; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> -; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) -; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer -; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 -; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) 
-; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]],
-; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
-; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef)
-; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
-; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
-; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]],
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef)
-; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
-; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]],
-; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
-; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef)
-; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
-; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
-; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]],
-; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
-; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef)
-; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
-; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
-; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]],
-; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
-; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef)
-; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
-; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
-; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]],
-; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
-; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
-; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef)
-; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
-; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
-; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]],
-; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX6]], 32
+; AVX512-NEXT: [[VEC_IND_NEXT_1]] = add <16 x i64> [[VEC_IND]],
+; AVX512-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 256
+; AVX512-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AVX512: for.end:
; AVX512-NEXT: ret void
;
; FVW2-LABEL: @foo2_addrspace3(
@@ -1793,142 +1283,24 @@
; AVX512-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; AVX512-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 4
; AVX512-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
-; AVX512-NEXT: [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -16
-; AVX512-NEXT: [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 4
-; AVX512-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1
-; AVX512-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP15]], 7
-; AVX512-NEXT: [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 112
-; AVX512-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; AVX512: vector.ph.new:
-; AVX512-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 2305843009213693944
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX512: vector.body:
-; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_7:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_7:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_7:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; AVX512-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64>
-; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP19]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP17]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP20:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP20]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP17]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP21]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 16
-; AVX512-NEXT: [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 256
-; AVX512-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]]
-; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[PTR_IND]], <16 x i64>
-; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x float>, <16 x float>* [[TMP24]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_1]], <16 x float*> [[TMP22]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[NEXT_GEP_1]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_1:%.*]] = load <16 x float>, <16 x float>* [[TMP25]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP22]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_1]], <16 x float*> [[TMP26]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 32
-; AVX512-NEXT: [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 512
-; AVX512-NEXT: [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]]
-; AVX512-NEXT: [[TMP27:%.*]] = getelementptr float, float* [[PTR_IND_1]], <16 x i64>
-; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x float>, <16 x float>* [[TMP29]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_2]], <16 x float*> [[TMP27]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP30:%.*]] = bitcast float* [[NEXT_GEP_2]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_2:%.*]] = load <16 x float>, <16 x float>* [[TMP30]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP27]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_2]], <16 x float*> [[TMP31]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 48
-; AVX512-NEXT: [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 768
-; AVX512-NEXT: [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]]
-; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[PTR_IND_2]], <16 x i64>
-; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x float>, <16 x float>* [[TMP34]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_3]], <16 x float*> [[TMP32]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[NEXT_GEP_3]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_3:%.*]] = load <16 x float>, <16 x float>* [[TMP35]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP32]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_3]], <16 x float*> [[TMP36]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_3:%.*]] = or i64 [[INDEX]], 64
-; AVX512-NEXT: [[PTR_IND_3:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1024
-; AVX512-NEXT: [[NEXT_GEP_4:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_3]]
-; AVX512-NEXT: [[TMP37:%.*]] = getelementptr float, float* [[PTR_IND_3]], <16 x i64>
-; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_4]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_4:%.*]] = load <16 x float>, <16 x float>* [[TMP39]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_4]], <16 x float*> [[TMP37]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP40:%.*]] = bitcast float* [[NEXT_GEP_4]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_4:%.*]] = load <16 x float>, <16 x float>* [[TMP40]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP37]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_4]], <16 x float*> [[TMP41]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_4:%.*]] = or i64 [[INDEX]], 80
-; AVX512-NEXT: [[PTR_IND_4:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1280
-; AVX512-NEXT: [[NEXT_GEP_5:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_4]]
-; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[PTR_IND_4]], <16 x i64>
-; AVX512-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_5]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_5:%.*]] = load <16 x float>, <16 x float>* [[TMP44]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_5]], <16 x float*> [[TMP42]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_5]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_5:%.*]] = load <16 x float>, <16 x float>* [[TMP45]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP42]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_5]], <16 x float*> [[TMP46]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_5:%.*]] = or i64 [[INDEX]], 96
-; AVX512-NEXT: [[PTR_IND_5:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1536
-; AVX512-NEXT: [[NEXT_GEP_6:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_5]]
-; AVX512-NEXT: [[TMP47:%.*]] = getelementptr float, float* [[PTR_IND_5]], <16 x i64>
-; AVX512-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_6]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_6:%.*]] = load <16 x float>, <16 x float>* [[TMP49]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_6]], <16 x float*> [[TMP47]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP50:%.*]] = bitcast float* [[NEXT_GEP_6]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_6:%.*]] = load <16 x float>, <16 x float>* [[TMP50]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP47]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_6]], <16 x float*> [[TMP51]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_6:%.*]] = or i64 [[INDEX]], 112
-; AVX512-NEXT: [[PTR_IND_6:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 1792
-; AVX512-NEXT: [[NEXT_GEP_7:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_6]]
-; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[PTR_IND_6]], <16 x i64>
-; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_7]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP53]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_7:%.*]] = load <16 x float>, <16 x float>* [[TMP54]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_7]], <16 x float*> [[TMP52]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP55:%.*]] = bitcast float* [[NEXT_GEP_7]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_7:%.*]] = load <16 x float>, <16 x float>* [[TMP55]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP52]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_7]], <16 x float*> [[TMP56]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_7]] = add i64 [[INDEX]], 128
-; AVX512-NEXT: [[PTR_IND_7]] = getelementptr float, float* [[POINTER_PHI]], i64 2048
-; AVX512-NEXT: [[NITER_NSUB_7]] = add i64 [[NITER]], -8
-; AVX512-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NSUB_7]], 0
-; AVX512-NEXT: br i1 [[NITER_NCMP_7]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; AVX512: middle.block.unr-lcssa:
-; AVX512-NEXT: [[POINTER_PHI_UNR:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND_7]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_7]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; AVX512-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
-; AVX512: vector.body.epil:
-; AVX512-NEXT: [[POINTER_PHI_EPIL:%.*]] = phi float* [ [[PTR_IND_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[POINTER_PHI_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; AVX512-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; AVX512-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; AVX512-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]]
-; AVX512-NEXT: [[TMP57:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <16 x i64>
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP59:%.*]] = bitcast float* [[TMP58]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP59]], align 4, !alias.scope !2
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD_EPIL]], <16 x float*> [[TMP57]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <16 x float>*
-; AVX512-NEXT: [[WIDE_LOAD15_EPIL:%.*]] = load <16 x float>, <16 x float>* [[TMP60]], align 4, !alias.scope !9
-; AVX512-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP57]], i64 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15_EPIL]], <16 x float*> [[TMP61]], i32 4, <16 x i1> ), !alias.scope !5, !noalias !7
-; AVX512-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 16
-; AVX512-NEXT: [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 256
-; AVX512-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
-; AVX512-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
-; AVX512-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP11:![0-9]+]]
+; AVX512-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[POINTER_PHI]], <16 x i64>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP15]], align 4, !alias.scope !7
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD]], <16 x float*> [[TMP13]], i32 4, <16 x i1> ), !alias.scope !10, !noalias !12
+; AVX512-NEXT: [[TMP16:%.*]] = bitcast float* [[NEXT_GEP]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x float>, <16 x float>* [[TMP16]], align 4, !alias.scope !14
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, <16 x float*> [[TMP13]], i64 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[WIDE_LOAD15]], <16 x float*> [[TMP17]], i32 4, <16 x i1> ), !alias.scope !10, !noalias !12
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX512-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 256
+; AVX512-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
@@ -1940,15 +1312,15 @@
; AVX512-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
-; AVX512-NEXT: [[TMP62:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; AVX512-NEXT: store float [[TMP62]], float* [[DEST_ADDR_011]], align 4
-; AVX512-NEXT: [[TMP63:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
+; AVX512-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; AVX512-NEXT: store float [[TMP19]], float* [[DEST_ADDR_011]], align 4
+; AVX512-NEXT: [[TMP20:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
-; AVX512-NEXT: store float [[TMP63]], float* [[ARRAYIDX5]], align 4
+; AVX512-NEXT: store float [[TMP20]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
; AVX512-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
; AVX512-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
-; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; AVX512: for.end:
; AVX512-NEXT: ret void
;
@@ -1993,144 +1365,34 @@
; FVW2-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[PTR]], i64 [[N_VEC]]
; FVW2-NEXT: [[TMP12:%.*]] = shl i64 [[N_VEC]], 4
; FVW2-NEXT: [[IND_END14:%.*]] = getelementptr float, float* [[DEST]], i64 [[TMP12]]
-; FVW2-NEXT: [[TMP13:%.*]] = add nsw i64 [[N_VEC]], -4
-; FVW2-NEXT: [[TMP14:%.*]] = lshr exact i64 [[TMP13]], 2
-; FVW2-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP14]], 1
-; FVW2-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP15]], 3
-; FVW2-NEXT: [[TMP16:%.*]] = icmp ult i64 [[TMP13]], 12
-; FVW2-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; FVW2: vector.ph.new:
-; FVW2-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP15]], 9223372036854775804
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
; FVW2: vector.body:
-; FVW2-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH_NEW]] ], [ [[PTR_IND_3:%.*]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; FVW2-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; FVW2-NEXT: [[TMP17:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP20]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP19]], i64 2
-; FVW2-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x float>, <2 x float>* [[TMP22]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD]], <2 x float*> [[TMP17]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16]], <2 x float*> [[TMP18]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP23:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP23]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
-; FVW2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x float>, <2 x float>* [[TMP25]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP17]], i64 1
-; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP18]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17]], <2 x float*> [[TMP26]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18]], <2 x float*> [[TMP27]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 4
-; FVW2-NEXT: [[PTR_IND:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 64
-; FVW2-NEXT: [[NEXT_GEP_1:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT]]
-; FVW2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[PTR_IND]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP29:%.*]] = getelementptr float, float* [[PTR_IND]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_1]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x float>, <2 x float>* [[TMP31]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP30]], i64 2
-; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_1:%.*]] = load <2 x float>, <2 x float>* [[TMP33]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_1]], <2 x float*> [[TMP28]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_1]], <2 x float*> [[TMP29]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP34:%.*]] = bitcast float* [[NEXT_GEP_1]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_1:%.*]] = load <2 x float>, <2 x float>* [[TMP34]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP35:%.*]] = getelementptr float, float* [[NEXT_GEP_1]], i64 2
-; FVW2-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP35]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_1:%.*]] = load <2 x float>, <2 x float>* [[TMP36]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP28]], i64 1
-; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP29]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_1]], <2 x float*> [[TMP37]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_1]], <2 x float*> [[TMP38]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 8
-; FVW2-NEXT: [[PTR_IND_1:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 128
-; FVW2-NEXT: [[NEXT_GEP_2:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_1]]
-; FVW2-NEXT: [[TMP39:%.*]] = getelementptr float, float* [[PTR_IND_1]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[PTR_IND_1]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_2]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP42:%.*]] = bitcast float* [[TMP41]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x float>, <2 x float>* [[TMP42]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP41]], i64 2
-; FVW2-NEXT: [[TMP44:%.*]] = bitcast float* [[TMP43]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_2:%.*]] = load <2 x float>, <2 x float>* [[TMP44]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_2]], <2 x float*> [[TMP39]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_2]], <2 x float*> [[TMP40]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP45:%.*]] = bitcast float* [[NEXT_GEP_2]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_2:%.*]] = load <2 x float>, <2 x float>* [[TMP45]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[NEXT_GEP_2]], i64 2
-; FVW2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_2:%.*]] = load <2 x float>, <2 x float>* [[TMP47]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP39]], i64 1
-; FVW2-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP40]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_2]], <2 x float*> [[TMP48]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_2]], <2 x float*> [[TMP49]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 12
-; FVW2-NEXT: [[PTR_IND_2:%.*]] = getelementptr float, float* [[POINTER_PHI]], i64 192
-; FVW2-NEXT: [[NEXT_GEP_3:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_NEXT_2]]
-; FVW2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[PTR_IND_2]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP51:%.*]] = getelementptr float, float* [[PTR_IND_2]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_3]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x float>, <2 x float>* [[TMP53]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP52]], i64 2
-; FVW2-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_3:%.*]] = load <2 x float>, <2 x float>* [[TMP55]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_3]], <2 x float*> [[TMP50]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_3]], <2 x float*> [[TMP51]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP56:%.*]] = bitcast float* [[NEXT_GEP_3]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_3:%.*]] = load <2 x float>, <2 x float>* [[TMP56]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP57:%.*]] = getelementptr float, float* [[NEXT_GEP_3]], i64 2
-; FVW2-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_3:%.*]] = load <2 x float>, <2 x float>* [[TMP58]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP50]], i64 1
-; FVW2-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP51]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_3]], <2 x float*> [[TMP59]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_3]], <2 x float*> [[TMP60]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_3]] = add i64 [[INDEX]], 16
-; FVW2-NEXT: [[PTR_IND_3]] = getelementptr float, float* [[POINTER_PHI]], i64 256
-; FVW2-NEXT: [[NITER_NSUB_3]] = add i64 [[NITER]], -4
-; FVW2-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0
-; FVW2-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; FVW2: middle.block.unr-lcssa:
-; FVW2-NEXT: [[POINTER_PHI_UNR:%.*]] = phi float* [ [[DEST]], [[VECTOR_PH]] ], [ [[PTR_IND_3]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ]
-; FVW2-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; FVW2-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
-; FVW2: vector.body.epil:
-; FVW2-NEXT: [[POINTER_PHI_EPIL:%.*]] = phi float* [ [[PTR_IND_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[POINTER_PHI_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; FVW2-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[INDEX_UNR]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; FVW2-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ], [ [[XTRAITER]], [[MIDDLE_BLOCK_UNR_LCSSA]] ]
-; FVW2-NEXT: [[NEXT_GEP_EPIL:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX_EPIL]]
-; FVW2-NEXT: [[TMP61:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <2 x i64> <i64 0, i64 16>
-; FVW2-NEXT: [[TMP62:%.*]] = getelementptr float, float* [[POINTER_PHI_EPIL]], <2 x i64> <i64 32, i64 48>
-; FVW2-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP_EPIL]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP64]], align 4, !alias.scope !7
-; FVW2-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP63]], i64 2
-; FVW2-NEXT: [[TMP66:%.*]] = bitcast float* [[TMP65]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD16_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP66]], align 4, !alias.scope !7
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD_EPIL]], <2 x float*> [[TMP61]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16_EPIL]], <2 x float*> [[TMP62]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[TMP67:%.*]] = bitcast float* [[NEXT_GEP_EPIL]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD17_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP67]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP68:%.*]] = getelementptr float, float* [[NEXT_GEP_EPIL]], i64 2
-; FVW2-NEXT: [[TMP69:%.*]] = bitcast float* [[TMP68]] to <2 x float>*
-; FVW2-NEXT: [[WIDE_LOAD18_EPIL:%.*]] = load <2 x float>, <2 x float>* [[TMP69]], align 4, !alias.scope !14
-; FVW2-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP61]], i64 1
-; FVW2-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP62]], i64 1
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17_EPIL]], <2 x float*> [[TMP70]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18_EPIL]], <2 x float*> [[TMP71]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
-; FVW2-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 4
-; FVW2-NEXT: [[PTR_IND_EPIL]] = getelementptr float, float* [[POINTER_PHI_EPIL]], i64 64
-; FVW2-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
-; FVW2-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
-; FVW2-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP16:![0-9]+]]
+; FVW2-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 0, i64 16>
+; FVW2-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[POINTER_PHI]], <2 x i64> <i64 32, i64 48>
+; FVW2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[NEXT_GEP]], i64 [[IDXPROM]]
+; FVW2-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP16]], align 4, !alias.scope !7
+; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP15]], i64 2
+; FVW2-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x float>, <2 x float>* [[TMP18]], align 4, !alias.scope !7
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD]], <2 x float*> [[TMP13]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD16]], <2 x float*> [[TMP14]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[NEXT_GEP]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x float>, <2 x float>* [[TMP19]], align 4, !alias.scope !14
+; FVW2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[NEXT_GEP]], i64 2
+; FVW2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>*
+; FVW2-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x float>, <2 x float>* [[TMP21]], align 4, !alias.scope !14
+; FVW2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP13]], i64 1
+; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, <2 x float*> [[TMP14]], i64 1
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD17]], <2 x float*> [[TMP22]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> [[WIDE_LOAD18]], <2 x float*> [[TMP23]], i32 4, <2 x i1> <i1 true, i1 true>), !alias.scope !10, !noalias !12
+; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; FVW2-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; FVW2-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i64 64
+; FVW2-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; FVW2: middle.block:
 ; FVW2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
 ; FVW2-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]]
@@ -2142,15 +1404,15 @@
 ; FVW2-NEXT: [[PTR_ADDR_012:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[PTR_ADDR_012_PH]], [[FOR_BODY_PREHEADER]] ]
 ; FVW2-NEXT: [[DEST_ADDR_011:%.*]] = phi float* [ [[ADD_PTR6:%.*]], [[FOR_BODY]] ], [ [[DEST_ADDR_011_PH]], [[FOR_BODY_PREHEADER]] ]
 ; FVW2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 [[IDXPROM]]
-; FVW2-NEXT: [[TMP72:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; FVW2-NEXT: store float [[TMP72]], float* [[DEST_ADDR_011]], align 4
-; FVW2-NEXT: [[TMP73:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
+; FVW2-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; FVW2-NEXT: store float [[TMP25]], float* [[DEST_ADDR_011]], align 4
+; FVW2-NEXT: [[TMP26:%.*]] = load float, float* [[PTR_ADDR_012]], align 4
 ; FVW2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 1
-; FVW2-NEXT: store float [[TMP73]], float* [[ARRAYIDX5]], align 4
+; FVW2-NEXT: store float [[TMP26]], float* [[ARRAYIDX5]], align 4
 ; FVW2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[PTR_ADDR_012]], i64 1
 ; FVW2-NEXT: [[ADD_PTR6]] = getelementptr inbounds float, float* [[DEST_ADDR_011]], i64 16
 ; FVW2-NEXT: [[CMP_NOT:%.*]] = icmp eq float* [[INCDEC_PTR]], [[ADD_PTR]]
-; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; FVW2-NEXT: br i1 [[CMP_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; FVW2: for.end:
 ; FVW2-NEXT: ret void
 ;
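For readers cross-checking the FVW2 expectations above: the scalar for.body in this hunk suggests a kernel of roughly the following shape. This is a sketch reconstructed from the CHECK lines, not the test's actual source (gather_scatter.ll carries that), and all identifiers are illustrative; the IR names in the comments refer to the FileCheck variables above.

/*
 * Sketch of the loop behind the FVW2 checks; illustrative names only.
 * Each iteration reads ptr[d] and ptr[0] and stores them to dest[0] and
 * dest[1], then advances ptr by 1 and dest by 16 floats. The stride-16
 * destination is what the masked scatters implement after vectorization,
 * via pointers built from <i64 0, i64 16> and <i64 32, i64 48> offsets
 * off the pointer PHI.
 */
void strided_scatter_copy(float *ptr, float *dest, long long len, long long d) {
  float *end = ptr + len;      /* [[ADD_PTR]] in the IR */
  while (ptr != end) {
    dest[0] = ptr[d];          /* load via [[IDXPROM]], store to [[DEST_ADDR_011]] */
    dest[1] = *ptr;            /* store to [[ARRAYIDX5]] */
    ptr += 1;                  /* [[INCDEC_PTR]]: unit-stride source */
    dest += 16;                /* [[ADD_PTR6]]: stride-16 destination */
  }
}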
Index: llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -19,153 +19,95 @@
 ; CHECK: for.body.preheader:
 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER8:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK: vector.memcheck:
 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP6]], [[X]]
 ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[Y]]
 ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER8]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967292
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A:%.*]], i32 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[N_VEC]], -4
-; CHECK-NEXT: [[TMP1:%.*]] = lshr exact i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], 12
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; CHECK: vector.ph.new:
-; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], 9223372036854775804
-; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP0:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP9]], align 8, !tbaa [[TBAA3:![0-9]+]], !alias.scope !7
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP10]], <4 x double>* [[TMP12]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x double>, <4 x double>* [[TMP14]], align 8, !tbaa [[TBAA3]], !alias.scope !7
-; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_1]], [[TMP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_NEXT]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_NEXT_1]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x double>, <4 x double>* [[TMP19]], align 8, !tbaa [[TBAA3]], !alias.scope !7
-; CHECK-NEXT: [[TMP20:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_2]], [[TMP6]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_NEXT_1]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast double* [[TMP21]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP20]], <4 x double>* [[TMP22]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_NEXT_2]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x double>, <4 x double>* [[TMP24]], align 8, !tbaa [[TBAA3]], !alias.scope !7
-; CHECK-NEXT: [[TMP25:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_3]], [[TMP7]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_NEXT_2]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP25]], <4 x double>* [[TMP27]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT_3]] = add i64 [[INDEX]], 16
-; CHECK-NEXT: [[NITER_NSUB_3]] = add i64 [[NITER]], -4
-; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NSUB_3]], 0
-; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block.unr-lcssa:
-; CHECK-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_3]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL_PREHEADER:%.*]]
-; CHECK: vector.body.epil.preheader:
-; CHECK-NEXT: [[TMP28:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
-; CHECK-NEXT: br label [[VECTOR_BODY_EPIL:%.*]]
-; CHECK: vector.body.epil:
-; CHECK-NEXT: [[INDEX_EPIL:%.*]] = phi i64 [ [[INDEX_UNR]], [[VECTOR_BODY_EPIL_PREHEADER]] ], [ [[INDEX_NEXT_EPIL:%.*]], [[VECTOR_BODY_EPIL]] ]
-; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[XTRAITER]], [[VECTOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_SUB:%.*]], [[VECTOR_BODY_EPIL]] ]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX_EPIL]]
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <4 x double>, <4 x double>* [[TMP30]], align 8, !tbaa [[TBAA3]], !alias.scope !7
-; CHECK-NEXT: [[TMP31:%.*]] = fmul fast <4 x double> [[WIDE_LOAD_EPIL]], [[TMP28]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX_EPIL]]
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP31]], <4 x double>* [[TMP33]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT_EPIL]] = add i64 [[INDEX_EPIL]], 4
-; CHECK-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1
-; CHECK-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0
-; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[MIDDLE_BLOCK]], label [[VECTOR_BODY_EPIL]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]], !alias.scope !7
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> [[WIDE_LOAD]], [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
+; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP5]], align 8, !tbaa [[TBAA3]], !alias.scope !10, !noalias !7
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER8]]
-; CHECK: for.body.preheader8:
-; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[TMP34:%.*]] = xor i64 [[INDVARS_IV_PH]], -1
-; CHECK-NEXT: [[TMP35:%.*]] = add nsw i64 [[TMP34]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: [[XTRAITER9:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
-; CHECK-NEXT: [[LCMP_MOD10_NOT:%.*]] = icmp eq i64 [[XTRAITER9]], 0
-; CHECK-NEXT: br i1 [[LCMP_MOD10_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[BC_RESUME_VAL]], -1
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP7]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
+; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL_PREHEADER:%.*]]
 ; CHECK: for.body.prol.preheader:
-; CHECK-NEXT: [[TMP36:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[TMP9:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT: br label [[FOR_BODY_PROL:%.*]]
 ; CHECK: for.body.prol:
-; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PROL_PREHEADER]] ]
-; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_BODY_PROL]] ], [ [[XTRAITER9]], [[FOR_BODY_PROL_PREHEADER]] ]
+; CHECK-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_BODY_PROL]] ], [ [[BC_RESUME_VAL]], [[FOR_BODY_PROL_PREHEADER]] ]
+; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_BODY_PROL]] ], [ [[XTRAITER]], [[FOR_BODY_PROL_PREHEADER]] ]
 ; CHECK-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_PROL]]
 ; CHECK-NEXT: [[T0_PROL:%.*]] = load double, double* [[ARRAYIDX_PROL]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP37:%.*]] = fmul fast double [[T0_PROL]], [[TMP36]]
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast double [[T0_PROL]], [[TMP9]]
 ; CHECK-NEXT: [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_PROL]]
-; CHECK-NEXT: store double [[TMP37]], double* [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP10]], double* [[ARRAYIDX2_PROL]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 1
 ; CHECK-NEXT: [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
 ; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
-; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[FOR_BODY_PROL_LOOPEXIT]], label [[FOR_BODY_PROL]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: for.body.prol.loopexit:
-; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER8]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
-; CHECK-NEXT: [[TMP38:%.*]] = icmp ult i64 [[TMP35]], 3
-; CHECK-NEXT: br i1 [[TMP38]], label [[FOR_END]], label [[FOR_BODY_PREHEADER8_NEW:%.*]]
-; CHECK: for.body.preheader8.new:
-; CHECK-NEXT: [[TMP39:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT: [[TMP40:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT: [[TMP41:%.*]] = fdiv fast double 1.000000e+00, [[A]]
-; CHECK-NEXT: [[TMP42:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP8]], 3
+; CHECK-NEXT: br i1 [[TMP11]], label [[FOR_END]], label [[FOR_BODY_PREHEADER8:%.*]]
+; CHECK: for.body.preheader8:
+; CHECK-NEXT: [[TMP12:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[TMP13:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[TMP14:%.*]] = fdiv fast double 1.000000e+00, [[A]]
+; CHECK-NEXT: [[TMP15:%.*]] = fdiv fast double 1.000000e+00, [[A]]
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER8_NEW]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_UNR]], [[FOR_BODY_PREHEADER8]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT: [[T0:%.*]] = load double, double* [[ARRAYIDX]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP43:%.*]] = fmul fast double [[T0]], [[TMP39]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul fast double [[T0]], [[TMP12]]
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[TMP43]], double* [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP16]], double* [[ARRAYIDX2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT]]
 ; CHECK-NEXT: [[T0_1:%.*]] = load double, double* [[ARRAYIDX_1]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP44:%.*]] = fmul fast double [[T0_1]], [[TMP40]]
+; CHECK-NEXT: [[TMP17:%.*]] = fmul fast double [[T0_1]], [[TMP13]]
 ; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: store double [[TMP44]], double* [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP17]], double* [[ARRAYIDX2_1]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT_1]]
 ; CHECK-NEXT: [[T0_2:%.*]] = load double, double* [[ARRAYIDX_2]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP45:%.*]] = fmul fast double [[T0_2]], [[TMP41]]
+; CHECK-NEXT: [[TMP18:%.*]] = fmul fast double [[T0_2]], [[TMP14]]
 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT_1]]
-; CHECK-NEXT: store double [[TMP45]], double* [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP18]], double* [[ARRAYIDX2_2]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[INDVARS_IV_NEXT_2]]
 ; CHECK-NEXT: [[T0_3:%.*]] = load double, double* [[ARRAYIDX_3]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP46:%.*]] = fmul fast double [[T0_3]], [[TMP42]]
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast double [[T0_3]], [[TMP15]]
 ; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[INDVARS_IV_NEXT_2]]
-; CHECK-NEXT: store double [[TMP46]], double* [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
+; CHECK-NEXT: store double [[TMP19]], double* [[ARRAYIDX2_3]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: for.end:
 ; CHECK-NEXT: ret void
 ;
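The vdiv.ll expectations above correspond to a kernel of roughly the following shape; this is a sketch reconstructed from the CHECK lines rather than copied from the test, and the test file itself carries the authoritative source.

/*
 * Sketch of the vdiv kernel behind the CHECK lines above. Under fast-math
 * the pipeline hoists a single reciprocal (1.0 / a) out of the loop and
 * turns the loop-body fdiv into an fmul. In the old expectations the
 * runtime unroller then cloned the vector body four times, duplicating
 * that hoisted reciprocal; the new expectations keep a single four-wide
 * vector body with one <4 x double> reciprocal, while the unrolled scalar
 * remainder still carries one scalar reciprocal per cloned iteration.
 */
void vdiv(double *x, double *y, double a, int N) {
  for (int i = 0; i < N; i++)
    x[i] = y[i] / a; /* compiled as x[i] = y[i] * (1.0 / a) */
}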