diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2381,6 +2381,15 @@ Type *Ty, unsigned AddrSpace, Instruction *I = nullptr) const; + /// Return true if it is beneficial to retain post-indexing-friendly patterns + /// while performing optimizations. + virtual bool shouldRetainImmediatePostIncrement(const DataLayout &DL, + Type *Ty, CombineLevel Level, + unsigned AddrSpace, + int64_t Increment) const { + return false; + } + /// Return the cost of the scaling factor used in the addressing mode /// represented by AM for this target, for a load/store of the specified type. /// diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -524,6 +524,7 @@ bool reassociationCanBreakAddressingModePattern(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1); + bool reassociationCanBreakPostIndexingPattern(SDNode *N); SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1); SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, @@ -1053,6 +1054,37 @@ return false; } +bool DAGCombiner::reassociationCanBreakPostIndexingPattern(SDNode *N) { + const DataLayout &DL = DAG.getDataLayout(); + if (N->getOpcode() != ISD::ADD) + return false; + + auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!Const) + return false; + + const APInt &APIntVal = Const->getAPIntValue(); + if (APIntVal.getBitWidth() > 64) + return false; + const int64_t ConstValue = APIntVal.getSExtValue(); + + // Check for (load/store (add x, const)) + + for (SDNode *Node : N->getOperand(0)->uses()) { + auto LoadStore = dyn_cast<LSBaseSDNode>(Node); + if (!LoadStore) + continue; + + EVT VT = LoadStore->getMemoryVT(); + Type *AccessTy = VT.getTypeForEVT(*DAG.getContext()); + unsigned AS = LoadStore->getAddressSpace(); + if (TLI.shouldRetainImmediatePostIncrement(DL, AccessTy, Level, AS, + ConstValue)) + return true; + } + return false; +} + // Helper for DAGCombiner::reassociateOps. Try to reassociate an expression // such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, @@ -2488,6 +2520,13 @@ EVT VT = N0.getValueType(); SDLoc DL(N); + // Prevent ADD reassociation as well as converting ADD -> OR + if (reassociationCanBreakPostIndexingPattern(N) || + reassociationCanBreakPostIndexingPattern(N0.getNode()) || + reassociationCanBreakPostIndexingPattern(N1.getNode())) { + return SDValue(); + } + if (SDValue Combined = visitADDLike(N)) return Combined; diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -43,6 +43,7 @@ CodeGenOpt::Level OptLevel); FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); +FunctionPass *createARMPostIndexingOptimizationPass(); FunctionPass *createARMExpandPseudoPass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); @@ -65,6 +66,7 @@ void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); +void initializeARMPostIndexingOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -469,6 +469,11 @@ Type *Ty, unsigned AS, Instruction *I = nullptr) const override; + bool shouldRetainImmediatePostIncrement(const DataLayout &DL, Type *Ty, + CombineLevel Level, + unsigned AddrSpace, + int64_t Increment) const override; + /// getScalingFactorCost - Return the cost of the scaling used in /// addressing mode represented by AM. /// If the AM is supported, the return value must be >= 0. diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -18707,6 +18707,27 @@ return true; } +bool ARMTargetLowering::shouldRetainImmediatePostIncrement( + const DataLayout &DL, Type *Ty, CombineLevel Level, unsigned AddrSpace, + int64_t Increment) const { + // NEON has rather restricted address calculation for vector load / store + // instructions compared to MVE or AArch64 ASIMD. + if (Subtarget->hasMVEIntegerOps()) + return false; + + // If the first DAG optimization pass did not consume this increment, + // try combining as usual during subsequent optimization passes. + if (Level != CombineLevel::BeforeLegalizeTypes) + return false; + + if (!Ty->isVectorTy()) + return false; + + unsigned BitSize = DL.getTypeSizeInBits(Ty); + + return BitSize > 64 && isPowerOf2_32(BitSize); +} + /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can compare /// a register against the immediate without having to materialize the diff --git a/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp b/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp @@ -0,0 +1,300 @@ +//===- ARMPostIndexingOptimizer.cpp - Prepare ld/st for post-indexing -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file ARMPostIndexingOpt pass transforms address operands of load/store +/// instructions to allow them to be emitted as NEON load/store with post-index +/// addressing mode. For example: +/// +/// %first = gep %base, %offset1 +/// load %first +/// %second = gep %base, %offset2 +/// load %second +/// +/// this sequence may be transformed into: +/// +/// %first = gep %base, %offset1 +/// load %first +/// %second = gep %first, (%offset2 - %offset1) +/// load %second +/// +/// The transformation is done only if: +/// 1) GEPs are constant +/// 2) Difference between offsets is compatible with either "[Rn]!" +/// or "[Rn], Rm" addressing modes. +// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMSubtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsARM.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-post-indexing-opt" +#define PASS_DESC "ARM post-indexed access optimizer" + +namespace { + +// LdStInfo holds the base address of a load or store and an immediate offset +// of a memory access, obtained by a best-effort heuristic, as well as other +// useful instruction properties. +struct LdStInfo { + LdStInfo(const DataLayout &DL, Instruction *LdSt, unsigned BaseOperandIndex, + int AccessSize); + // A memory access in question + Instruction *LdSt; + // An actual operand of LdSt can be updated throughout this pass execution, + // so store an index instead + unsigned BaseOperandIndex; + // A guessed base address + Value *IndirectBase; + // An immediate offset to add to IndirectBase + int32_t Offset; + // An access size that can be used for post-indexed addressing mode + int AccessSize; + + // Returns current *direct* base operand + Value *getBaseOperand() { return LdSt->getOperand(BaseOperandIndex); } +}; + +struct ARMPostIndexingOpt : public FunctionPass { + static char ID; + + ARMPostIndexingOpt() : FunctionPass(ID) {} + + StringRef getPassName() const override { return PASS_DESC; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.setPreservesCFG(); + } + + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +char ARMPostIndexingOpt::ID = 0; +INITIALIZE_PASS(ARMPostIndexingOpt, DEBUG_TYPE, PASS_DESC, false, false) + +// Returns (nullptr, 0) for instructions not handled by this pass +static std::pair<Type *, unsigned> getDataTypeAndBaseIndex(Instruction *I) { + if (LoadInst *Load = dyn_cast<LoadInst>(I)) + return std::make_pair(Load->getType(), 0); + if (StoreInst *Store = dyn_cast<StoreInst>(I)) + return std::make_pair(Store->getValueOperand()->getType(), 1); + + if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) { + switch (Intrinsic->getIntrinsicID()) { + case Intrinsic::arm_neon_vld1: + case Intrinsic::arm_neon_vld2: + case Intrinsic::arm_neon_vld3: + case Intrinsic::arm_neon_vld4: + return std::make_pair(Intrinsic->getType(), 0); + case Intrinsic::arm_neon_vst1: + case Intrinsic::arm_neon_vst2: + case Intrinsic::arm_neon_vst3: + case Intrinsic::arm_neon_vst4: + return
std::make_pair(Intrinsic->getOperand(1)->getType(), 0); + default: + break; + } + } + return std::make_pair(nullptr, 0); +} + +LdStInfo::LdStInfo(const DataLayout &DL, Instruction *LdSt, + unsigned BaseOperandIndex, int AccessSize) + : LdSt(LdSt), BaseOperandIndex(BaseOperandIndex), Offset(0), + AccessSize(AccessSize) { + IndirectBase = LdSt->getOperand(BaseOperandIndex); + for (;;) { + IndirectBase = IndirectBase->stripPointerCasts(); + // Match GetElementPtrInst as well as the corresponding ConstantExpr + if (auto *GEP = dyn_cast<GEPOperator>(IndirectBase)) { + APInt APOffset(32, 0, /* isSigned = */ true); + if (GEP->accumulateConstantOffset(DL, APOffset)) { + IndirectBase = GEP->getPointerOperand(); + Offset += APOffset.getSExtValue(); + continue; + } + } + return; + } +} + +// Guess a common stride (aside from post-incrementing by access size) that +// is suitable for "[Rn], Rm" addressing mode, if any +static int32_t guessCustomAccessStride(ArrayRef<LdStInfo> Instructions) { + int32_t Stride = 0; // not decided + assert(!Instructions.empty()); + for (auto I = Instructions.begin(), End = Instructions.end(); + std::next(I) != End; ++I) { + int32_t ThisStride = std::next(I)->Offset - I->Offset; + // Check if "[Rn]!" addressing mode can be used + if (ThisStride == I->AccessSize) + continue; + // If this is the first instruction requiring a register operand, + // request this stride value + if (Stride == 0) + Stride = ThisStride; + // If multiple different stride values have to be used, + // conservatively refrain from using "[Rn], Rm" addressing mode + if (Stride != ThisStride) + return 0; + } + return Stride; +} + +// Rewrite the memory address used by Second to use the address of First +// incremented by a constant value +static bool rewriteAddressCalculation(LdStInfo &First, LdStInfo &Second, + int32_t RegStride, + const TargetLibraryInfo &TLI) { + LLVM_DEBUG(dbgs() << "Rewriting load/store for post-indexing: "; + Second.LdSt->dump()); + + IRBuilder<> IRB(Second.LdSt); + const DataLayout &DL = Second.LdSt->getModule()->getDataLayout(); + + int32_t Stride = Second.Offset - First.Offset; + if (Stride != First.AccessSize && Stride != RegStride) + return false; + + // In case the GEPOperator matched a ConstantExpr, replace it with an + // instruction to prevent folding + if (auto Const = dyn_cast<ConstantExpr>(First.getBaseOperand())) { + auto Inst = Const->getAsInstruction(); + Inst->insertBefore(First.LdSt); + First.LdSt->replaceUsesOfWith(Const, Inst); + } + + Value *FirstBase = First.getBaseOperand(); + Value *OldSecondBase = Second.getBaseOperand(); + PointerType *FirstBaseTy = cast<PointerType>(FirstBase->getType()); + PointerType *SecondBaseTy = cast<PointerType>(OldSecondBase->getType()); + assert(FirstBaseTy->getAddressSpace() == 0 && "Unexpected address space"); + assert(SecondBaseTy->getAddressSpace() == 0 && "Unexpected address space"); + + int32_t FirstElementSize = + DL.getTypeSizeInBits(FirstBaseTy->getElementType()) / 8; + + Value *NewSecondBase; + if (FirstBaseTy == SecondBaseTy && Stride % FirstElementSize == 0) { + int32_t ElementStride = Stride / FirstElementSize; + Type *EltTy = FirstBaseTy->getPointerElementType(); + NewSecondBase = + IRB.CreateConstGEP1_32(EltTy, FirstBase, ElementStride, "postinc"); + } else { + Value *FirstBaseBytePtr = + IRB.CreateBitCast(FirstBase, IRB.getInt8PtrTy(), "oldbase.byteptr"); + Value *NewSecondBaseBytePtr = IRB.CreateConstGEP1_32( + IRB.getInt8Ty(), FirstBaseBytePtr, Stride, "postinc.byteptr"); + NewSecondBase = + IRB.CreateBitCast(NewSecondBaseBytePtr, SecondBaseTy, "postinc"); + } +
Second.LdSt->replaceUsesOfWith(OldSecondBase, NewSecondBase); + LLVM_DEBUG(dbgs() << "New load/store: "; Second.LdSt->dump()); + RecursivelyDeleteTriviallyDeadInstructions(OldSecondBase, &TLI, nullptr); + return true; +} + +static bool isProfitable(const SmallVectorImpl<LdStInfo> &Instructions, + int32_t RegStride) { + if (Instructions.size() < 2) + return false; + + unsigned Matches = 1; // start from 1 since we look for pairs + for (auto I = Instructions.begin(), End = Instructions.end(); + std::next(I) != End; ++I) { + int32_t Stride = std::next(I)->Offset - I->Offset; + if (Stride == I->AccessSize || Stride == RegStride) + Matches++; + } + + if (Matches < 4 && Instructions.size() > Matches) { + // Bail out if there are other users of the base pointer and not a + // lot of consecutive accesses. + return false; + } + return true; +} + +static bool runOnBasicBlock(BasicBlock &BB, const TargetLibraryInfo &TLI) { + DenseMap<Value *, SmallVector<LdStInfo, 8>> LdStMap; + const DataLayout &DL = BB.getModule()->getDataLayout(); + + // Collect relevant load/store instructions, grouped by guessed base address + for (auto &I : BB) { + Type *ValueTy; + unsigned BaseOperandIndex; + std::tie(ValueTy, BaseOperandIndex) = getDataTypeAndBaseIndex(&I); + if (!ValueTy || !ValueTy->isVectorTy()) + continue; + + unsigned AccessSize = DL.getTypeSizeInBits(ValueTy) / 8; + if (isPowerOf2_32(AccessSize)) { + LdStInfo LSI(DL, &I, BaseOperandIndex, AccessSize); + LLVM_DEBUG(dbgs() << "found load/store, base: " << LSI.IndirectBase + << ", offset: " << LSI.Offset << '\n' + << I << '\n'); + LdStMap[LSI.IndirectBase].push_back(std::move(LSI)); + } + } + + // For each group, form a chain of address increments + bool Modified = false; + for (auto BaseAndWorklist : LdStMap) { + auto Worklist = BaseAndWorklist.second; + assert(!Worklist.empty()); + int32_t RegStride = guessCustomAccessStride(Worklist); + if (!isProfitable(Worklist, RegStride)) { + LLVM_DEBUG(dbgs() << "not profitable to transform this ld/st group: " + << BaseAndWorklist.first << '\n'); + continue; + } + for (auto I = Worklist.begin(), E = Worklist.end(); std::next(I) != E; + ++I) { + Modified |= rewriteAddressCalculation(*I, *std::next(I), RegStride, TLI); + } + } + return Modified; +} + +bool ARMPostIndexingOpt::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + // If MVE is available, skip this function. + const auto &TPC = getAnalysis<TargetPassConfig>(); + const auto &TM = TPC.getTM<TargetMachine>(); + const auto &STI = TM.getSubtarget<ARMSubtarget>(F); + if (STI.hasMVEIntegerOps()) + return false; + + const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); + + bool Modified = false; + for (auto &BB : F) + Modified |= runOnBasicBlock(BB, TLI); + return Modified; +} + +FunctionPass *llvm::createARMPostIndexingOptimizationPass() { + return new ARMPostIndexingOpt(); +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -90,6 +90,7 @@ PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeGlobalISel(Registry); initializeARMLoadStoreOptPass(Registry); + initializeARMPostIndexingOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); initializeARMConstantIslandsPass(Registry); @@ -471,6 +472,7 @@ // any ISel takes place. We should have a more principled way of handling // this. See D99707 for more details.
addPass(createBarrierNoopPass()); + addPass(createARMPostIndexingOptimizationPass()); } return false; diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt --- a/llvm/lib/Target/ARM/CMakeLists.txt +++ b/llvm/lib/Target/ARM/CMakeLists.txt @@ -47,6 +47,7 @@ ARMMacroFusion.cpp ARMRegisterInfo.cpp ARMOptimizeBarriersPass.cpp + ARMPostIndexingOptimizer.cpp ARMRegisterBankInfo.cpp ARMSelectionDAGInfo.cpp ARMSLSHardening.cpp diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -64,6 +64,7 @@ ; CHECK-NEXT: Transform predicated vector loops to use MVE tail predication ; CHECK-NEXT: A No-Op Barrier Pass ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: ARM post-indexed access optimizer ; CHECK-NEXT: Safe Stack instrumentation pass ; CHECK-NEXT: Insert stack protectors ; CHECK-NEXT: Module Verifier diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll @@ -0,0 +1,343 @@ +; RUN: llc -o - < %s | FileCheck --check-prefix=ASM %s +; RUN: opt --arm-post-indexing-opt -S -o - < %s | FileCheck --check-prefix=IR %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-unknown-linux-gnueabihf" + +define <4 x float> @test(float* %A) { + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} +; IR-LABEL: define <4 x float> @test(float* %A) { +; IR-NEXT: %X.ptr = bitcast float* %A to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1 +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1 +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +;ASM-LABEL: test: +;ASM-NOT: add +;ASM-NOT: sub +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE:[0-9]+]]]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! 
+;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +define <4 x float> @test_stride(float* %A) { + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_stride(float* %A) { +; IR-NEXT: %X.ptr = bitcast float* %A to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %oldbase.byteptr = bitcast <4 x float>* %X.ptr to i8* +; IR-NEXT: %postinc.byteptr = getelementptr i8, i8* %oldbase.byteptr, i32 24 +; IR-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>* +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %oldbase.byteptr1 = bitcast <4 x float>* %postinc to i8* +; IR-NEXT: %postinc.byteptr2 = getelementptr i8, i8* %oldbase.byteptr1, i32 24 +; IR-NEXT: %postinc3 = bitcast i8* %postinc.byteptr2 to <4 x float>* +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc3, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +; ASM-LABEL: test_stride: +; ASM: mov r[[STRIDE:[0-9]+]], #24 +; ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE:[0-9]+]]], r[[STRIDE]] +; ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]], r[[STRIDE]] +; ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +define <4 x float> @test_stride_mixed(float* %A) { + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_stride_mixed(float* %A) { +; IR-NEXT: %X.ptr = bitcast float* %A to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %oldbase.byteptr = bitcast <4 x float>* %X.ptr to i8* +; IR-NEXT: %postinc.byteptr = getelementptr i8, i8* %oldbase.byteptr, i32 24 +; IR-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>* +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1 +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +; ASM-LABEL: test_stride_mixed: +; ASM: mov r[[STRIDE:[0-9]+]], #24 +; ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE:[0-9]+]]], r[[STRIDE]] +; ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! 
+; ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +; Refrain from using multiple stride registers +define <4 x float> @test_stride_noop(float* %A) { + %X.ptr = bitcast float* %A to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_stride_noop(float* %A) { +; IR-NEXT: %X.ptr = bitcast float* %A to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6 +; IR-NEXT: %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 +; IR-NEXT: %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14 +; IR-NEXT: %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +define <4 x float> @test_positive_initial_offset(float* %A) { + %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8 + %X.ptr = bitcast float* %X.ptr.elt to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_positive_initial_offset(float* %A) { +; IR-NEXT: %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8 +; IR-NEXT: %X.ptr = bitcast float* %X.ptr.elt to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1 +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1 +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +;ASM-LABEL: test_positive_initial_offset: +;ASM: add r[[BASE:[0-9]+]], r[[BASE]], #32 +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! 
+;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +define <4 x float> @test_negative_initial_offset(float* %A) { + %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16 + %X.ptr = bitcast float* %X.ptr.elt to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_negative_initial_offset(float* %A) { +; IR-NEXT: %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16 +; IR-NEXT: %X.ptr = bitcast float* %X.ptr.elt to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1 +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1 +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +;ASM-LABEL: test_negative_initial_offset: +;ASM: sub r[[BASE:[0-9]+]], r[[BASE]], #64 +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +@global_float_array = external global [128 x float], align 4 +define <4 x float> @test_global() { + %X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4 + %Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4 + %Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_global() { +; IR-NEXT: %1 = bitcast float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %1, align 4 +; IR-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %1, i32 1 +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1 +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +;ASM-LABEL: test_global: +;ASM: add r[[BASE:[0-9]+]], r[[BASE]], #32 +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! 
+;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +define <4 x float> @test_stack() { +; Use huge alignment to test that ADD would not be converted to OR + %array = alloca [32 x float], align 128 + %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0 + call void @external_function(float* %arraydecay) + %X.ptr = bitcast [32 x float]* %array to <4 x float>* + %X = load <4 x float>, <4 x float>* %X.ptr, align 4 + %Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8 + %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>* + %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4 + %tmp.sum = fadd <4 x float> %X, %Y + %sum = fadd <4 x float> %tmp.sum, %Z + ret <4 x float> %sum +} + +; IR-LABEL: define <4 x float> @test_stack() { +; IR-NEXT: %array = alloca [32 x float], align 128 +; IR-NEXT: %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0 +; IR-NEXT: call void @external_function(float* %arraydecay) +; IR-NEXT: %X.ptr = bitcast [32 x float]* %array to <4 x float>* +; IR-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4 +; IR-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1 +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1 +; IR-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4 +; IR-NEXT: %tmp.sum = fadd <4 x float> %X, %Y +; IR-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z +; IR-NEXT: ret <4 x float> %sum +; IR-NEXT: } + +;ASM-LABEL: test_stack: +;ASM: bl external_function +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE:[0-9]+]]:128]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]:128]! +;ASM: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +define <2 x double> @test_double(double* %A) { + %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8 + %X.ptr = bitcast double* %X.ptr.elt to <2 x double>* + %X = load <2 x double>, <2 x double>* %X.ptr, align 8 + %Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10 + %Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>* + %Y = load <2 x double>, <2 x double>* %Y.ptr, align 8 + %Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12 + %Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>* + %Z = load <2 x double>, <2 x double>* %Z.ptr, align 8 + %tmp.sum = fadd <2 x double> %X, %Y + %sum = fadd <2 x double> %tmp.sum, %Z + ret <2 x double> %sum +} + +; IR-LABEL: define <2 x double> @test_double(double* %A) { +; IR-NEXT: %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8 +; IR-NEXT: %X.ptr = bitcast double* %X.ptr.elt to <2 x double>* +; IR-NEXT: %X = load <2 x double>, <2 x double>* %X.ptr, align 8 +; IR-NEXT: %postinc = getelementptr <2 x double>, <2 x double>* %X.ptr, i32 1 +; IR-NEXT: %Y = load <2 x double>, <2 x double>* %postinc, align 8 +; IR-NEXT: %postinc1 = getelementptr <2 x double>, <2 x double>* %postinc, i32 1 +; IR-NEXT: %Z = load <2 x double>, <2 x double>* %postinc1, align 8 +; IR-NEXT: %tmp.sum = fadd <2 x double> %X, %Y +; IR-NEXT: %sum = fadd <2 x double> %tmp.sum, %Z +; IR-NEXT: ret <2 x double> %sum +; IR-NEXT: } + +;ASM-LABEL: test_double: +;ASM: add r[[BASE:[0-9]+]], r[[BASE]], #64 +;ASM: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! 
+;ASM: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! +;ASM: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] + +define void @test_various_instructions(float* %A) { + %X.ptr = bitcast float* %A to i8* + %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1) + %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4 + %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>* + %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4 + %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8 + %Z.ptr = bitcast float* %Z.ptr.elt to i8* + %Z = fadd <4 x float> %X, %Y + tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4) + ret void +} + +; IR-LABEL: define void @test_various_instructions(float* %A) { +; IR-NEXT: %X.ptr = bitcast float* %A to i8* +; IR-NEXT: %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1) +; IR-NEXT: %postinc.byteptr = getelementptr i8, i8* %X.ptr, i32 16 +; IR-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>* +; IR-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4 +; IR-NEXT: %Z = fadd <4 x float> %X, %Y +; IR-NEXT: %oldbase.byteptr = bitcast <4 x float>* %postinc to i8* +; IR-NEXT: %postinc.byteptr1 = getelementptr i8, i8* %oldbase.byteptr, i32 16 +; IR-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %postinc.byteptr1, <4 x float> %Z, i32 4) +; IR-NEXT: ret void +; IR-NEXT: } + +; ASM-LABEL: test_various_instructions: +; ASM: @ %bb.0: +; ASM-NEXT: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE:[0-9]+]]]! +; ASM-NEXT: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]]! +; ASM-NEXT: vadd.f32 +; ASM-NEXT: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[BASE]]] +; ASM-NEXT: bx lr + +declare void @external_function(float*) +declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly diff --git a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll --- a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll +++ b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll @@ -76,9 +76,9 @@ ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]] @@ -86,6 +86,7 @@ ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]] +; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]] @@ -93,8 +94,6 @@ ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]] -; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} - ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]] } @@ -170,9 +169,9 @@ ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]] @@ -180,6 +179,7 @@ ; CHECK: aesd.8 
[[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]] +; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]] @@ -187,7 +187,6 @@ ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]] -; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}} ; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}} ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]] } diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll --- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -198,21 +198,13 @@ ; @testNeon is an important example of the nead for ivchains. ; -; Currently we have two extra add.w's that keep the store address -; live past the next increment because ISEL is unfortunately undoing -; the store chain. ISEL also fails to convert all but one of the stores to -; post-increment addressing. However, the loads should use -; post-increment addressing, no add's or add.w's beyond the three -; mentioned. Most importantly, there should be no spills or reloads! +; Loads and stores should use post-increment addressing, no add's or add.w's. +; Most importantly, there should be no spills or reloads! ; ; A9: testNeon: ; A9: %.lr.ph -; A9: add.w r ; A9-NOT: lsl.w ; A9-NOT: {{ldr|str|adds|add r}} -; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}} -; A9: add.w r -; A9-NOT: {{ldr|str|adds|add r}} ; A9-NOT: add.w r ; A9: bne define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {