diff --git a/llvm/include/llvm/Transforms/Scalar/InferAlignment.h b/llvm/include/llvm/Transforms/Scalar/InferAlignment.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Scalar/InferAlignment.h
@@ -0,0 +1,27 @@
+//===- InferAlignment.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Infer alignment for load, stores and other memory operations based on
+// trailing zero known bits information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H
+#define LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+struct InferAlignmentPass : public PassInfoMixin<InferAlignmentPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_INFERALIGNMENT_H
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -203,6 +203,15 @@
 /// deleted and it returns the pointer to the alloca inserted.
 AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = nullptr);
 
+/// If the specified pointer points to an object that we control, try to modify
+/// the object's alignment to PrefAlign. Returns a minimum known alignment of
+/// the value after the operation, which may be lower than PrefAlign.
+///
+/// Increasing value alignment isn't often possible though. If alignment is
+/// important, a more reliable approach is to simply align all global variables
+/// and allocation instructions to their preferred alignment from the beginning.
+Align tryEnforceAlignment(Value *V, Align PrefAlign, const DataLayout &DL);
+
 /// Try to ensure that the alignment of \p V is at least \p PrefAlign bytes. If
 /// the owning object can be modified and has an alignment less than \p
 /// PrefAlign, it will be increased and \p PrefAlign returned. If the alignment
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -170,6 +170,7 @@
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
 #include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
 #include "llvm/Transforms/Scalar/InferAddressSpaces.h"
+#include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Scalar/LICM.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -88,6 +88,7 @@
 #include "llvm/Transforms/Scalar/Float2Int.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/InferAlignment.h"
 #include "llvm/Transforms/Scalar/InstSimplifyPass.h"
 #include "llvm/Transforms/Scalar/JumpThreading.h"
 #include "llvm/Transforms/Scalar/LICM.h"
@@ -1138,6 +1139,7 @@
   FPM.addPass(LoopVectorizePass(
       LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
 
+  FPM.addPass(InferAlignmentPass());
   if (IsFullLTO) {
     // The vectorizer may have significantly shortened a loop body; unroll
     // again. Unroll small loops to hide loop backedge latency and saturate any
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -327,6 +327,7 @@
 FUNCTION_PASS("gvn-sink", GVNSinkPass())
 FUNCTION_PASS("helloworld", HelloWorldPass())
 FUNCTION_PASS("infer-address-spaces", InferAddressSpacesPass())
+FUNCTION_PASS("infer-alignment", InferAlignmentPass())
 FUNCTION_PASS("instcount", InstCountPass())
 FUNCTION_PASS("instsimplify", InstSimplifyPass())
 FUNCTION_PASS("invalidate", InvalidateAllAnalysesPass())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -22,6 +22,7 @@
   InductiveRangeCheckElimination.cpp
   IndVarSimplify.cpp
   InferAddressSpaces.cpp
+  InferAlignment.cpp
   InstSimplifyPass.cpp
   JumpThreading.cpp
   LICM.cpp
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -0,0 +1,92 @@
+//===- InferAlignment.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Infer alignment for load, stores and other memory operations based on
+// trailing zero known bits information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/InferAlignment.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+static bool tryToImproveAlign(
+    const DataLayout &DL, Instruction *I,
+    llvm::function_ref<Align(Value *, Align, Align)>
+        Fn) {
+  if (auto *LI = dyn_cast<LoadInst>(I)) {
+    Value *PtrOp = LI->getPointerOperand();
+    Align OldAlign = LI->getAlign();
+    Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(LI->getType()));
+    if (NewAlign > OldAlign) {
+      LI->setAlignment(NewAlign);
+      return true;
+    }
+  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+    Value *PtrOp = SI->getPointerOperand();
+    Value *ValOp = SI->getValueOperand();
+    Align OldAlign = SI->getAlign();
+    Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(ValOp->getType()));
+    if (NewAlign > OldAlign) {
+      SI->setAlignment(NewAlign);
+      return true;
+    }
+  }
+  // TODO: Also handle memory intrinsics.
+  return false;
+}
+
+bool inferAlignment(Function &F, AssumptionCache &AC, DominatorTree &DT) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  bool Changed = false;
+
+  // Enforce preferred type alignment if possible. We do this as a separate
+  // pass first, because it may improve the alignments we infer below.
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      Changed |= tryToImproveAlign(
+          DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
+            if (PrefAlign > OldAlign)
+              return std::max(OldAlign,
+                              tryEnforceAlignment(PtrOp, PrefAlign, DL));
+            return OldAlign;
+          });
+    }
+  }
+
+  // Compute alignment from known bits.
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      Changed |= tryToImproveAlign(
+          DL, &I, [&](Value *PtrOp, Align OldAlign, Align PrefAlign) {
+            KnownBits Known = computeKnownBits(PtrOp, DL, 0, &AC, &I, &DT);
+            unsigned TrailZ = std::min(Known.countMinTrailingZeros(),
+                                       +Value::MaxAlignmentExponent);
+            return Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+          });
+    }
+  }
+
+  return Changed;
+}
+
+PreservedAnalyses InferAlignmentPass::run(Function &F,
+                                          FunctionAnalysisManager &AM) {
+  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  inferAlignment(F, AC, DT);
+  // Changes to alignment shouldn't invalidate analyses.
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -1371,15 +1371,8 @@
   return EliminateDuplicatePHINodesSetBasedImpl(BB);
 }
 
-/// If the specified pointer points to an object that we control, try to modify
-/// the object's alignment to PrefAlign. Returns a minimum known alignment of
-/// the value after the operation, which may be lower than PrefAlign.
-///
-/// Increating value alignment isn't often possible though. If alignment is
-/// important, a more reliable approach is to simply align all global variables
-/// and allocation instructions to their preferred alignment from the beginning.
-static Align tryEnforceAlignment(Value *V, Align PrefAlign,
-                                 const DataLayout &DL) {
+Align llvm::tryEnforceAlignment(Value *V, Align PrefAlign,
+                                const DataLayout &DL) {
   V = V->stripPointerCasts();
 
   if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -245,6 +245,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -116,6 +116,7 @@
 ; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
 ; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
+; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
 ; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
 ; CHECK-O23SZ-NEXT: Running pass: SROAPass on foo
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -172,6 +172,7 @@
 ; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -160,6 +160,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -167,6 +167,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis
 ; CHECK-O-NEXT: Running pass: InjectTLIMappings
 ; CHECK-O-NEXT: Running pass: LoopVectorizePass
+; CHECK-O-NEXT: Running pass: InferAlignmentPass
 ; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
diff --git a/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll b/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll
--- a/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll
+++ b/llvm/test/Transforms/LoopUnroll/PowerPC/p10-respect-unroll-pragma.ll @@ -14,97 +14,97 @@ ; CHECK-NEXT: [[I24_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_1:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_1:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_1]]) ; CHECK-NEXT: [[I24_ELT_1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_1]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_1]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_1]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_1]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_1]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_2:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_2:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_2]]) ; CHECK-NEXT: [[I24_ELT_2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_2]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_2]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_2]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_2]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_2]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_3:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_3:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_3]]) ; CHECK-NEXT: [[I24_ELT_3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_3]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_3]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_3]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_3]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_3]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_4:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_4:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_4]]) ; CHECK-NEXT: [[I24_ELT_4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_4]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_4]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_4]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_4]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_4]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_5:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_5:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_5]]) ; CHECK-NEXT: [[I24_ELT_5:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_5]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_5]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_5:%.*]] = 
extractvalue { <16 x i8>, <16 x i8> } [[I24_5]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_5]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_5]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_6:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_6:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_6]]) ; CHECK-NEXT: [[I24_ELT_6:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_6]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_6]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_6:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_6]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_6]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_6]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_7:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_7:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_7]]) ; CHECK-NEXT: [[I24_ELT_7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_7]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_7]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_7:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_7]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_7]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_7]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_8:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_8:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_8]]) ; CHECK-NEXT: [[I24_ELT_8:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_8]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_8]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_8:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_8]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_8]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_8]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_9:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_9:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_9]]) ; CHECK-NEXT: [[I24_ELT_9:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_9]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_9]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_9:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_9]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_9]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_9]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_10:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_10:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_10]]) ; CHECK-NEXT: [[I24_ELT_10:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_10]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_10]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_10:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_10]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_10]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_10]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_11:%.*]] = tail call <256 x i1> 
@llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_11:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_11]]) ; CHECK-NEXT: [[I24_ELT_11:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_11]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_11]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_11:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_11]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_11]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_11]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_12:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_12:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_12]]) ; CHECK-NEXT: [[I24_ELT_12:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_12]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_12]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_12:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_12]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_12]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_12]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_13:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_13:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_13]]) ; CHECK-NEXT: [[I24_ELT_13:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_13]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_13]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_13:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_13]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_13]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_13]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_14:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_14:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_14]]) ; CHECK-NEXT: [[I24_ELT_14:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_14]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_14]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_14:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_14]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_14]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_14]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: [[I20_15:%.*]] = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr nonnull inttoptr (i64 -32 to ptr)) ; CHECK-NEXT: [[I24_15:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> [[I20_15]]) ; CHECK-NEXT: [[I24_ELT_15:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_15]], 0 ; CHECK-NEXT: store <16 x i8> [[I24_ELT_15]], ptr inttoptr (i64 48 to ptr), align 16 ; CHECK-NEXT: [[I24_ELT1_15:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[I24_15]], 1 -; CHECK-NEXT: store <16 x i8> [[I24_ELT1_15]], ptr inttoptr (i64 64 to ptr), align 16 +; CHECK-NEXT: store <16 x i8> [[I24_ELT1_15]], ptr inttoptr (i64 64 to ptr), align 64 ; CHECK-NEXT: br label [[BB16]], !llvm.loop [[LOOP0:![0-9]+]] ; bb: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll +++ 
b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll @@ -12,126 +12,126 @@ define void @test_known_trip_count() { ; CHECK-LABEL: @test_known_trip_count( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr @b, align 8 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr @c, align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr @b, align 16 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x double>, ptr @c, align 16 ; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD3]] -; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @a, align 8 -; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 2), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 2), align 8 +; CHECK-NEXT: store <2 x double> [[TMP0]], ptr @a, align 16 +; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 2), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 2), align 16 ; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[WIDE_LOAD_1]], [[WIDE_LOAD3_1]] -; CHECK-NEXT: store <2 x double> [[TMP1]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 2), align 8 -; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 4), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 4), align 8 +; CHECK-NEXT: store <2 x double> [[TMP1]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 2), align 16 +; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 4), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 4), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[WIDE_LOAD_2]], [[WIDE_LOAD3_2]] -; CHECK-NEXT: store <2 x double> [[TMP2]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 4), align 8 -; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 6), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 6), align 8 +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 4), align 16 +; CHECK-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 6), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 6), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[WIDE_LOAD_3]], [[WIDE_LOAD3_3]] -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 6), align 8 -; CHECK-NEXT: [[WIDE_LOAD_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 8), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 8), align 8 +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 6), align 16 +; CHECK-NEXT: [[WIDE_LOAD_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], 
ptr @b, i64 0, i64 8), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 8), align 16 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD_4]], [[WIDE_LOAD3_4]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 8), align 8 -; CHECK-NEXT: [[WIDE_LOAD_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 10), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 10), align 8 +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 8), align 16 +; CHECK-NEXT: [[WIDE_LOAD_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 10), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_5:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 10), align 16 ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD_5]], [[WIDE_LOAD3_5]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 10), align 8 -; CHECK-NEXT: [[WIDE_LOAD_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 12), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 12), align 8 +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 10), align 16 +; CHECK-NEXT: [[WIDE_LOAD_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 12), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_6:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 12), align 16 ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[WIDE_LOAD_6]], [[WIDE_LOAD3_6]] -; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 12), align 8 -; CHECK-NEXT: [[WIDE_LOAD_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 14), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 14), align 8 +; CHECK-NEXT: store <2 x double> [[TMP6]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 12), align 16 +; CHECK-NEXT: [[WIDE_LOAD_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 14), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_7:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 14), align 16 ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[WIDE_LOAD_7]], [[WIDE_LOAD3_7]] -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 14), align 8 -; CHECK-NEXT: [[WIDE_LOAD_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 16), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 16), align 8 +; CHECK-NEXT: store <2 x double> [[TMP7]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 14), align 16 +; CHECK-NEXT: [[WIDE_LOAD_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 16), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_8:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 16), align 16 
; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[WIDE_LOAD_8]], [[WIDE_LOAD3_8]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 16), align 8 -; CHECK-NEXT: [[WIDE_LOAD_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 18), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 18), align 8 +; CHECK-NEXT: store <2 x double> [[TMP8]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 16), align 16 +; CHECK-NEXT: [[WIDE_LOAD_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 18), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_9:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 18), align 16 ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[WIDE_LOAD_9]], [[WIDE_LOAD3_9]] -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 18), align 8 -; CHECK-NEXT: [[WIDE_LOAD_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 20), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 20), align 8 +; CHECK-NEXT: store <2 x double> [[TMP9]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 18), align 16 +; CHECK-NEXT: [[WIDE_LOAD_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 20), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_10:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 20), align 16 ; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[WIDE_LOAD_10]], [[WIDE_LOAD3_10]] -; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 20), align 8 -; CHECK-NEXT: [[WIDE_LOAD_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 22), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 22), align 8 +; CHECK-NEXT: store <2 x double> [[TMP10]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 20), align 16 +; CHECK-NEXT: [[WIDE_LOAD_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 22), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_11:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 22), align 16 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[WIDE_LOAD_11]], [[WIDE_LOAD3_11]] -; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 22), align 8 -; CHECK-NEXT: [[WIDE_LOAD_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 24), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 24), align 8 +; CHECK-NEXT: store <2 x double> [[TMP11]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 22), align 16 +; CHECK-NEXT: [[WIDE_LOAD_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 24), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_12:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 24), align 16 ; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[WIDE_LOAD_12]], [[WIDE_LOAD3_12]] -; CHECK-NEXT: store <2 x double> [[TMP12]], ptr 
getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 24), align 8 -; CHECK-NEXT: [[WIDE_LOAD_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 26), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 26), align 8 +; CHECK-NEXT: store <2 x double> [[TMP12]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 24), align 16 +; CHECK-NEXT: [[WIDE_LOAD_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 26), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_13:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 26), align 16 ; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[WIDE_LOAD_13]], [[WIDE_LOAD3_13]] -; CHECK-NEXT: store <2 x double> [[TMP13]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 26), align 8 -; CHECK-NEXT: [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 28), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 28), align 8 +; CHECK-NEXT: store <2 x double> [[TMP13]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 26), align 16 +; CHECK-NEXT: [[WIDE_LOAD_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 28), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_14:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 28), align 16 ; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[WIDE_LOAD_14]], [[WIDE_LOAD3_14]] -; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 28), align 8 -; CHECK-NEXT: [[WIDE_LOAD_15:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 30), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_15:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 30), align 8 +; CHECK-NEXT: store <2 x double> [[TMP14]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 28), align 16 +; CHECK-NEXT: [[WIDE_LOAD_15:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 30), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_15:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 30), align 16 ; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x double> [[WIDE_LOAD_15]], [[WIDE_LOAD3_15]] -; CHECK-NEXT: store <2 x double> [[TMP15]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 30), align 8 -; CHECK-NEXT: [[WIDE_LOAD_16:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 32), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_16:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 32), align 8 +; CHECK-NEXT: store <2 x double> [[TMP15]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 30), align 16 +; CHECK-NEXT: [[WIDE_LOAD_16:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 32), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_16:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 32), align 16 ; CHECK-NEXT: [[TMP16:%.*]] = fadd <2 x double> [[WIDE_LOAD_16]], [[WIDE_LOAD3_16]] -; CHECK-NEXT: store <2 x double> [[TMP16]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 32), align 8 -; CHECK-NEXT: [[WIDE_LOAD_17:%.*]] = load <2 x 
double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 34), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_17:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 34), align 8 +; CHECK-NEXT: store <2 x double> [[TMP16]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 32), align 16 +; CHECK-NEXT: [[WIDE_LOAD_17:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 34), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_17:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 34), align 16 ; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x double> [[WIDE_LOAD_17]], [[WIDE_LOAD3_17]] -; CHECK-NEXT: store <2 x double> [[TMP17]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 34), align 8 -; CHECK-NEXT: [[WIDE_LOAD_18:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 36), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_18:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 36), align 8 +; CHECK-NEXT: store <2 x double> [[TMP17]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 34), align 16 +; CHECK-NEXT: [[WIDE_LOAD_18:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 36), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_18:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 36), align 16 ; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[WIDE_LOAD_18]], [[WIDE_LOAD3_18]] -; CHECK-NEXT: store <2 x double> [[TMP18]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 36), align 8 -; CHECK-NEXT: [[WIDE_LOAD_19:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 38), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_19:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 38), align 8 +; CHECK-NEXT: store <2 x double> [[TMP18]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 36), align 16 +; CHECK-NEXT: [[WIDE_LOAD_19:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 38), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_19:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 38), align 16 ; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x double> [[WIDE_LOAD_19]], [[WIDE_LOAD3_19]] -; CHECK-NEXT: store <2 x double> [[TMP19]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 38), align 8 -; CHECK-NEXT: [[WIDE_LOAD_20:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 40), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_20:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 40), align 8 +; CHECK-NEXT: store <2 x double> [[TMP19]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 38), align 16 +; CHECK-NEXT: [[WIDE_LOAD_20:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 40), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_20:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 40), align 16 ; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x double> [[WIDE_LOAD_20]], [[WIDE_LOAD3_20]] -; CHECK-NEXT: store <2 x double> [[TMP20]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 40), align 8 -; CHECK-NEXT: [[WIDE_LOAD_21:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 42), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_21:%.*]] = 
load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 42), align 8 +; CHECK-NEXT: store <2 x double> [[TMP20]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 40), align 16 +; CHECK-NEXT: [[WIDE_LOAD_21:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 42), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_21:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 42), align 16 ; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[WIDE_LOAD_21]], [[WIDE_LOAD3_21]] -; CHECK-NEXT: store <2 x double> [[TMP21]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 42), align 8 -; CHECK-NEXT: [[WIDE_LOAD_22:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 44), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_22:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 44), align 8 +; CHECK-NEXT: store <2 x double> [[TMP21]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 42), align 16 +; CHECK-NEXT: [[WIDE_LOAD_22:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 44), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_22:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 44), align 16 ; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[WIDE_LOAD_22]], [[WIDE_LOAD3_22]] -; CHECK-NEXT: store <2 x double> [[TMP22]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 44), align 8 -; CHECK-NEXT: [[WIDE_LOAD_23:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 46), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_23:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 46), align 8 +; CHECK-NEXT: store <2 x double> [[TMP22]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 44), align 16 +; CHECK-NEXT: [[WIDE_LOAD_23:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 46), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_23:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 46), align 16 ; CHECK-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[WIDE_LOAD_23]], [[WIDE_LOAD3_23]] -; CHECK-NEXT: store <2 x double> [[TMP23]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 46), align 8 -; CHECK-NEXT: [[WIDE_LOAD_24:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 48), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_24:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 48), align 8 +; CHECK-NEXT: store <2 x double> [[TMP23]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 46), align 16 +; CHECK-NEXT: [[WIDE_LOAD_24:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 48), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_24:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 48), align 16 ; CHECK-NEXT: [[TMP24:%.*]] = fadd <2 x double> [[WIDE_LOAD_24]], [[WIDE_LOAD3_24]] -; CHECK-NEXT: store <2 x double> [[TMP24]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 48), align 8 -; CHECK-NEXT: [[WIDE_LOAD_25:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 50), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_25:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 50), align 8 +; CHECK-NEXT: store <2 x 
double> [[TMP24]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 48), align 16 +; CHECK-NEXT: [[WIDE_LOAD_25:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 50), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_25:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 50), align 16 ; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[WIDE_LOAD_25]], [[WIDE_LOAD3_25]] -; CHECK-NEXT: store <2 x double> [[TMP25]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 50), align 8 -; CHECK-NEXT: [[WIDE_LOAD_26:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 52), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_26:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 52), align 8 +; CHECK-NEXT: store <2 x double> [[TMP25]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 50), align 16 +; CHECK-NEXT: [[WIDE_LOAD_26:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 52), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_26:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 52), align 16 ; CHECK-NEXT: [[TMP26:%.*]] = fadd <2 x double> [[WIDE_LOAD_26]], [[WIDE_LOAD3_26]] -; CHECK-NEXT: store <2 x double> [[TMP26]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 52), align 8 -; CHECK-NEXT: [[WIDE_LOAD_27:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 54), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_27:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 54), align 8 +; CHECK-NEXT: store <2 x double> [[TMP26]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 52), align 16 +; CHECK-NEXT: [[WIDE_LOAD_27:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 54), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_27:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 54), align 16 ; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[WIDE_LOAD_27]], [[WIDE_LOAD3_27]] -; CHECK-NEXT: store <2 x double> [[TMP27]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 54), align 8 -; CHECK-NEXT: [[WIDE_LOAD_28:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 56), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_28:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 56), align 8 +; CHECK-NEXT: store <2 x double> [[TMP27]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 54), align 16 +; CHECK-NEXT: [[WIDE_LOAD_28:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 0, i64 56), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_28:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 0, i64 56), align 16 ; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[WIDE_LOAD_28]], [[WIDE_LOAD3_28]] -; CHECK-NEXT: store <2 x double> [[TMP28]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 56), align 8 -; CHECK-NEXT: [[WIDE_LOAD_29:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 0), align 8 -; CHECK-NEXT: [[WIDE_LOAD3_29:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 0), align 8 +; CHECK-NEXT: store <2 x double> [[TMP28]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 0, i64 56), align 16 +; CHECK-NEXT: 
[[WIDE_LOAD_29:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 0), align 16 +; CHECK-NEXT: [[WIDE_LOAD3_29:%.*]] = load <2 x double>, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 0), align 16 ; CHECK-NEXT: [[TMP29:%.*]] = fadd <2 x double> [[WIDE_LOAD_29]], [[WIDE_LOAD3_29]] -; CHECK-NEXT: store <2 x double> [[TMP29]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 1, i64 0), align 8 +; CHECK-NEXT: store <2 x double> [[TMP29]], ptr getelementptr inbounds ([58 x double], ptr @a, i64 1, i64 0), align 16 ; CHECK-NEXT: [[TMP30:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @b, i64 1, i64 2), align 8 ; CHECK-NEXT: [[TMP31:%.*]] = load double, ptr getelementptr inbounds ([58 x double], ptr @c, i64 1, i64 2), align 8 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP30]], [[TMP31]] @@ -180,19 +180,19 @@ ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [58 x double], ptr @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP2]], align 16 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2 -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD5]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD4]], [[WIDE_LOAD6]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 16 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 2 -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
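For illustration, a minimal sketch of the new pass in isolation (not part of the patch; the function name and RUN line below are invented). The align 16 parameter attribute and the shl-by-2 offset leave at least two known trailing zero bits in the computed pointer, so infer-alignment raises the store from align 1 to align 4. The same known-bits reasoning is what turns the stores to constant address 64 into align 64 in the PowerPC test above, while the preferred-alignment phase is what bumps the @a/@b/@c globals from align 8 to align 16 in the excessive-unrolling test.

; RUN: opt -passes=infer-alignment -S %s
define void @store_scaled_offset(ptr align 16 %base, i64 %i) {
  ; %off is a multiple of 4, so %base + %off keeps at least two trailing zero
  ; bits; the pass raises the store below from align 1 to align 4.
  %off = shl i64 %i, 2
  %p = getelementptr i8, ptr %base, i64 %off
  store i32 0, ptr %p, align 1
  ret void
}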