diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2381,6 +2381,15 @@
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
+  /// Return true if it is beneficial to retain post-indexing-friendly patterns
+  /// while performing optimizations.
+  virtual bool shouldRetainImmediatePostIncrement(const DataLayout &DL,
+                                                  Type *Ty, CombineLevel Level,
+                                                  unsigned AddrSpace,
+                                                  int64_t Increment) const {
+    return false;
+  }
+
   /// Return the cost of the scaling factor used in the addressing mode
   /// represented by AM for this target, for a load/store of the specified type.
   ///
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -524,6 +524,7 @@
     bool reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                     const SDLoc &DL, SDValue N0,
                                                     SDValue N1);
+    bool reassociationCanBreakPostIndexingPattern(SDNode *N);
     SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
                                       SDValue N1);
     SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
@@ -1053,6 +1054,37 @@
   return false;
 }
 
+bool DAGCombiner::reassociationCanBreakPostIndexingPattern(SDNode *N) {
+  const DataLayout &DL = DAG.getDataLayout();
+  if (N->getOpcode() != ISD::ADD)
+    return false;
+
+  auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!Const)
+    return false;
+
+  const APInt &APIntVal = Const->getAPIntValue();
+  if (APIntVal.getBitWidth() > 64)
+    return false;
+  const int64_t ConstValue = APIntVal.getSExtValue();
+
+  // Look for a load/store of x whose address can later be post-incremented
+  // by this (add x, const).
+
+  for (SDNode *Node : N->getOperand(0)->uses()) {
+    auto LoadStore = dyn_cast<LSBaseSDNode>(Node);
+    if (!LoadStore)
+      continue;
+
+    EVT VT = LoadStore->getMemoryVT();
+    Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
+    unsigned AS = LoadStore->getAddressSpace();
+    if (TLI.shouldRetainImmediatePostIncrement(DL, AccessTy, Level, AS,
+                                               ConstValue))
+      return true;
+  }
+  return false;
+}
+
 // Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
 // such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
 SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
@@ -2488,6 +2520,13 @@
   EVT VT = N0.getValueType();
   SDLoc DL(N);
 
+  // Prevent ADD reassociation as well as converting ADD -> OR
+  if (reassociationCanBreakPostIndexingPattern(N) ||
+      reassociationCanBreakPostIndexingPattern(N0.getNode()) ||
+      reassociationCanBreakPostIndexingPattern(N1.getNode())) {
+    return SDValue();
+  }
+
   if (SDValue Combined = visitADDLike(N))
     return Combined;
 
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -43,6 +43,7 @@
                                           CodeGenOpt::Level OptLevel);
 FunctionPass *createA15SDOptimizerPass();
 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMPostIndexingOptimizationPass();
 FunctionPass *createARMExpandPseudoPass();
 FunctionPass *createARMConstantIslandPass();
 FunctionPass *createMLxExpansionPass();
@@ -65,6 +66,7 @@
 
 void initializeARMParallelDSPPass(PassRegistry &);
 void initializeARMLoadStoreOptPass(PassRegistry &);
+void initializeARMPostIndexingOptPass(PassRegistry &);
 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
 void initializeARMConstantIslandsPass(PassRegistry &);
 void initializeARMExpandPseudoPass(PassRegistry &);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -469,6 +469,11 @@
                                Type *Ty, unsigned AS,
                                Instruction *I = nullptr) const override;
 
+    bool shouldRetainImmediatePostIncrement(const DataLayout &DL, Type *Ty,
+                                            CombineLevel Level,
+                                            unsigned AddrSpace,
+                                            int64_t Increment) const override;
+
     /// getScalingFactorCost - Return the cost of the scaling used in
     /// addressing mode represented by AM.
     /// If the AM is supported, the return value must be >= 0.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -18707,6 +18707,27 @@
   return true;
 }
 
+bool ARMTargetLowering::shouldRetainImmediatePostIncrement(
+    const DataLayout &DL, Type *Ty, CombineLevel Level, unsigned AddrSpace,
+    int64_t Increment) const {
+  // NEON has rather restricted address calculation for vector load / store
+  // instructions compared to MVE or AArch64 ASIMD.
+  if (Subtarget->hasMVEIntegerOps())
+    return false;
+
+  // If the first DAG optimization pass did not consume this increment,
+  // try combining as usual during subsequent optimization passes.
+  if (Level != CombineLevel::BeforeLegalizeTypes)
+    return false;
+
+  if (!Ty->isVectorTy())
+    return false;
+
+  unsigned BitSize = DL.getTypeSizeInBits(Ty);
+
+  return BitSize > 64 && isPowerOf2_32(BitSize);
+}
+
 /// isLegalICmpImmediate - Return true if the specified immediate is legal
 /// icmp immediate, that is the target has icmp instructions which can compare
 /// a register against the immediate without having to materialize the
diff --git a/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp b/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMPostIndexingOptimizer.cpp
@@ -0,0 +1,335 @@
+//===- ARMPostIndexingOptimizer.cpp - Prepare ld/st for post-indexing -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// The ARMPostIndexingOpt pass transforms address operands of load/store
+/// instructions to allow them to be emitted as NEON load/store with
+/// post-indexed addressing mode. For example:
+///
+///   %first = gep %base, %offset1
+///   load %first
+///   %second = gep %base, %offset2
+///   load %second
+///
+/// this sequence may be transformed into:
+///
+///   %first = gep %base, %offset1
+///   load %first
+///   %second = gep %first, (%offset2 - %offset1)
+///   load %second
+///
+/// The transformation is done only if:
+/// 1) GEP offsets are constant
+/// 2) Difference between offsets is compatible with either "[Rn]!"
+///    or "[Rn], Rm" addressing modes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-post-indexing-opt"
+#define PASS_DESC "ARM post-indexed access optimizer"
+
+namespace {
+
+// LdStInfo holds the base address of a load or store and an immediate offset
+// of the memory access, obtained by a best-effort heuristic, as well as other
+// useful instruction properties.
+struct LdStInfo {
+  LdStInfo(const DataLayout &DL, Instruction *LdSt, unsigned BaseOperandIndex,
+           int AccessSize);
+  // The memory access in question
+  Instruction *LdSt;
+  // An actual operand of LdSt can be updated throughout this pass execution,
+  // so store an index instead
+  unsigned BaseOperandIndex;
+  // A guessed base address
+  Value *IndirectBase;
+  // An immediate offset to add to IndirectBase
+  int32_t Offset;
+  // An access size that can be used for post-indexed addressing mode
+  int AccessSize;
+
+  // Returns current *direct* base operand
+  Value *getBaseOperand() { return LdSt->getOperand(BaseOperandIndex); }
+};
+
+struct ARMPostIndexingOpt : public FunctionPass {
+  static char ID;
+
+  ARMPostIndexingOpt() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override { return PASS_DESC; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char ARMPostIndexingOpt::ID = 0;
+INITIALIZE_PASS(ARMPostIndexingOpt, DEBUG_TYPE, PASS_DESC, false, false)
+
+// Returns (nullptr, 0) for instructions not handled by this pass
+static std::pair<Type *, unsigned> getDataTypeAndBaseIndex(Instruction *I) {
+  if (LoadInst *Load = dyn_cast<LoadInst>(I))
+    return std::make_pair(Load->getType(), 0);
+  if (StoreInst *Store = dyn_cast<StoreInst>(I))
+    return std::make_pair(Store->getValueOperand()->getType(), 1);
+
+  if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+    switch (Intrinsic->getIntrinsicID()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+      return std::make_pair(Intrinsic->getType(), 0);
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+      return std::make_pair(Intrinsic->getOperand(1)->getType(), 0);
+    default:
+      break;
+    }
+  }
+  return std::make_pair(nullptr, 0);
+}
+
+LdStInfo::LdStInfo(const DataLayout &DL, Instruction *LdSt,
+                   unsigned BaseOperandIndex, int AccessSize)
+    : LdSt(LdSt), BaseOperandIndex(BaseOperandIndex), Offset(0),
+      AccessSize(AccessSize) {
+  IndirectBase = LdSt->getOperand(BaseOperandIndex);
+  for (;;) {
+    IndirectBase = IndirectBase->stripPointerCasts();
+    // Match GetElementPtrInst as well as the corresponding ConstantExpr
+    if (auto *GEP = dyn_cast<GEPOperator>(IndirectBase)) {
+      APInt APOffset(32, 0, /* isSigned = */ true);
+      if (GEP->accumulateConstantOffset(DL, APOffset)) {
+        IndirectBase = GEP->getPointerOperand();
+        Offset += APOffset.getSExtValue();
+        continue;
+      }
+    }
+    return;
+  }
+}
+
+// Guess a common stride (aside from post-incrementing by access size) that
+// is suitable for "[Rn], Rm" addressing mode, if any
+static int32_t guessCustomAccessStride(ArrayRef<LdStInfo> Instructions) {
+  int32_t Stride = 0; // not decided
+  assert(!Instructions.empty());
+  for (auto I = Instructions.begin(), End = Instructions.end();
+       std::next(I) != End; ++I) {
+    int32_t ThisStride = std::next(I)->Offset - I->Offset;
+    // Check if "[Rn]!" addressing mode can be used
+    if (ThisStride == I->AccessSize)
+      continue;
+    // If this is the first instruction requiring a register operand,
+    // request this stride value
+    if (Stride == 0)
+      Stride = ThisStride;
+    // If multiple different stride values have to be used,
+    // conservatively refrain from using "[Rn], Rm" addressing mode
+    if (Stride != ThisStride)
+      return 0;
+  }
+  return Stride;
+}
+
+// Rewrite the memory address used by Second to use the address of First
+// incremented by a constant value
+static bool rewriteAddressCalculation(LdStInfo &First, LdStInfo &Second,
+                                      int32_t RegStride,
+                                      const TargetLibraryInfo &TLI) {
+  LLVM_DEBUG(dbgs() << "Rewriting load/store for post-indexing: ";
+             Second.LdSt->dump());
+
+  IRBuilder<> IRB(Second.LdSt);
+  const DataLayout &DL = Second.LdSt->getModule()->getDataLayout();
+
+  int32_t Stride = Second.Offset - First.Offset;
+  if (Stride != First.AccessSize && Stride != RegStride)
+    return false;
+
+  // In case GEPOperator matched a ConstantExpr, replace it by an instruction
+  // to prevent folding
+  if (auto Const = dyn_cast<ConstantExpr>(First.getBaseOperand())) {
+    auto Inst = Const->getAsInstruction();
+    Inst->insertBefore(First.LdSt);
+    First.LdSt->replaceUsesOfWith(Const, Inst);
+  }
+
+  Value *FirstBase = First.getBaseOperand();
+  Value *OldSecondBase = Second.getBaseOperand();
+  PointerType *FirstBaseTy = cast<PointerType>(FirstBase->getType());
+  PointerType *SecondBaseTy = cast<PointerType>(OldSecondBase->getType());
+  assert(FirstBaseTy->getAddressSpace() == 0 && "Unexpected address space");
+  assert(SecondBaseTy->getAddressSpace() == 0 && "Unexpected address space");
+
+  int32_t FirstElementSize =
+      DL.getTypeSizeInBits(FirstBaseTy->getElementType()) / 8;
+
+  Value *NewSecondBase;
+  if (FirstBaseTy == SecondBaseTy && Stride % FirstElementSize == 0) {
+    int32_t ElementStride = Stride / FirstElementSize;
+    Type *EltTy = FirstBaseTy->getPointerElementType();
+    NewSecondBase =
+        IRB.CreateConstGEP1_32(EltTy, FirstBase, ElementStride, "postinc");
+  } else {
+    Value *FirstBaseBytePtr =
+        IRB.CreateBitCast(FirstBase, IRB.getInt8PtrTy(), "oldbase.byteptr");
+    Value *NewSecondBaseBytePtr = IRB.CreateConstGEP1_32(
+        IRB.getInt8Ty(), FirstBaseBytePtr, Stride, "postinc.byteptr");
+    NewSecondBase =
+        IRB.CreateBitCast(NewSecondBaseBytePtr, SecondBaseTy, "postinc");
+  }
+
+  Second.LdSt->replaceUsesOfWith(OldSecondBase, NewSecondBase);
+  LLVM_DEBUG(dbgs() << "New load/store: "; Second.LdSt->dump());
+  RecursivelyDeleteTriviallyDeadInstructions(OldSecondBase, &TLI, nullptr);
+  return true;
+}
+
+static bool isProfitable(const SmallVectorImpl<LdStInfo> &Instructions,
+                         int32_t RegStride) {
+  if (Instructions.size() < 2)
+    return false;
+
+  unsigned Matches = 1; // start from 1 since we look for pairs
+  for (auto I = Instructions.begin(), End = Instructions.end();
+       std::next(I) != End; ++I) {
+    int32_t Stride = std::next(I)->Offset - I->Offset;
+    if (Stride == I->AccessSize || Stride == RegStride)
+      Matches++;
+  }
+
+  if (Matches < 4 && Instructions.size() > Matches) {
+    // Bail out if there are other users of the base pointer and not a
+    // lot of consecutive accesses.
+    return false;
+  }
+  return true;
+}
+
+static void refineIndirectBase(SmallVectorImpl<LdStInfo> &LdSt) {
+  // Match the following pattern:
+  //
+  //   %base1 = gep %truebase, %i
+  //   %ptr1 = gep %base1, 0
+  //   load %ptr1
+  //   %base2 = gep %truebase, %i
+  //   %ptr2 = gep %base2, 4
+  //   load %ptr2
+  //
+  // Here base1 and base2 are indirect bases in LdSt. Use the same
+  // value for all corresponding elements.
+  for (auto I = LdSt.begin(), End = LdSt.end(); std::next(I) != End; ++I) {
+    if (auto *GEP = dyn_cast<GEPOperator>(I->IndirectBase)) {
+      if (auto *GEPNext = dyn_cast<GEPOperator>(std::next(I)->IndirectBase)) {
+        if (GEP->getPointerOperand() != GEPNext->getPointerOperand() ||
+            GEP->getNumIndices() != GEPNext->getNumIndices() ||
+            !std::equal(GEP->idx_begin(), GEP->idx_end(), GEPNext->idx_begin()))
+          continue;
+
+        // GEPs are equal
+        std::next(I)->IndirectBase = GEP;
+      }
+    }
+  }
+}
+
+static bool runOnBasicBlock(BasicBlock &BB, const TargetLibraryInfo &TLI) {
+  const DataLayout &DL = BB.getModule()->getDataLayout();
+
+  // Collect relevant load/store instructions, grouped by guessed base address
+  SmallVector<LdStInfo, 8> LdSt;
+  for (Instruction &I : BB) {
+    Type *ValueTy;
+    unsigned BaseOperandIndex;
+    std::tie(ValueTy, BaseOperandIndex) = getDataTypeAndBaseIndex(&I);
+    if (!ValueTy || !ValueTy->isVectorTy())
+      continue;
+
+    unsigned AccessSize = DL.getTypeSizeInBits(ValueTy) / 8;
+    if (isPowerOf2_32(AccessSize))
+      LdSt.emplace_back(DL, &I, BaseOperandIndex, AccessSize);
+  }
+
+  if (LdSt.size() < 2)
+    return false;
+
+  refineIndirectBase(LdSt);
+
+  DenseMap<Value *, SmallVector<LdStInfo, 8>> LdStMap;
+  for (LdStInfo &LSI : LdSt) {
+    LdStMap[LSI.IndirectBase].push_back(std::move(LSI));
+    LLVM_DEBUG(dbgs() << "found load/store, base: " << LSI.IndirectBase
+                      << ", offset: " << LSI.Offset << '\n'
+                      << LSI.LdSt << '\n');
+  }
+  // For each group, form a chain of address increments
+  bool Modified = false;
+  for (auto BaseAndWorklist : LdStMap) {
+    auto Worklist = BaseAndWorklist.second;
+    assert(!Worklist.empty());
+    int32_t RegStride = guessCustomAccessStride(Worklist);
+    if (!isProfitable(Worklist, RegStride)) {
+      LLVM_DEBUG(dbgs() << "not profitable to transform this ld/st group: "
+                        << BaseAndWorklist.first << '\n');
+      continue;
+    }
+    for (auto I = Worklist.begin(), E = Worklist.end(); std::next(I) != E;
+         ++I) {
+      Modified |= rewriteAddressCalculation(*I, *std::next(I), RegStride, TLI);
+    }
+  }
+  return Modified;
+}
+
+bool ARMPostIndexingOpt::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+  // If MVE is available, skip this function.
+  const auto &TPC = getAnalysis<TargetPassConfig>();
+  const auto &TM = TPC.getTM<TargetMachine>();
+  const auto &STI = TM.getSubtarget<ARMSubtarget>(F);
+  if (STI.hasMVEIntegerOps())
+    return false;
+
+  const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+  bool Modified = false;
+  for (auto &BB : F)
+    Modified |= runOnBasicBlock(BB, TLI);
+  return Modified;
+}
+
+FunctionPass *llvm::createARMPostIndexingOptimizationPass() {
+  return new ARMPostIndexingOpt();
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -90,6 +90,7 @@
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeGlobalISel(Registry);
   initializeARMLoadStoreOptPass(Registry);
+  initializeARMPostIndexingOptPass(Registry);
   initializeARMPreAllocLoadStoreOptPass(Registry);
   initializeARMParallelDSPPass(Registry);
   initializeARMConstantIslandsPass(Registry);
@@ -471,6 +472,7 @@
     // any ISel takes place. We should have a more principled way of handling
     // this. See D99707 for more details.
     addPass(createBarrierNoopPass());
+    addPass(createARMPostIndexingOptimizationPass());
   }
 
   return false;
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -47,6 +47,7 @@
   ARMMacroFusion.cpp
   ARMRegisterInfo.cpp
   ARMOptimizeBarriersPass.cpp
+  ARMPostIndexingOptimizer.cpp
   ARMRegisterBankInfo.cpp
   ARMSelectionDAGInfo.cpp
   ARMSLSHardening.cpp
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -64,6 +64,7 @@
 ; CHECK-NEXT: Transform predicated vector loops to use MVE tail predication
 ; CHECK-NEXT: A No-Op Barrier Pass
 ; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: ARM post-indexed access optimizer
 ; CHECK-NEXT: Safe Stack instrumentation pass
 ; CHECK-NEXT: Insert stack protectors
 ; CHECK-NEXT: Module Verifier
diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt-ir.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt-ir.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt-ir.ll
@@ -0,0 +1,354 @@
+; RUN: opt --arm-post-indexing-opt -S -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-unknown-linux-gnueabihf"
+
+define <4 x float> @test(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+; CHECK-LABEL: define <4 x float> @test(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_stride(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 12
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stride(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %oldbase.byteptr = bitcast <4 x float>* %X.ptr to i8*
+; CHECK-NEXT: %postinc.byteptr = getelementptr i8, i8* %oldbase.byteptr, i32 24
+; CHECK-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %oldbase.byteptr1 = bitcast <4 x float>* %postinc to i8*
+; CHECK-NEXT: %postinc.byteptr2 = getelementptr i8, i8* %oldbase.byteptr1, i32 24
+; CHECK-NEXT: %postinc3 = bitcast i8* %postinc.byteptr2 to <4 x float>*
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc3, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_stride_mixed(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 10
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stride_mixed(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %oldbase.byteptr = bitcast <4 x float>* %X.ptr to i8*
+; CHECK-NEXT: %postinc.byteptr = getelementptr i8, i8* %oldbase.byteptr, i32 24
+; CHECK-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+; Refrain from using multiple stride registers
+define <4 x float> @test_stride_noop(float* %A) {
+  %X.ptr = bitcast float* %A to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stride_noop(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 6
+; CHECK-NEXT: %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+; CHECK-NEXT: %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 14
+; CHECK-NEXT: %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_positive_initial_offset(float* %A) {
+  %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+  %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 12
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 16
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_positive_initial_offset(float* %A) {
+; CHECK-NEXT: %X.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+; CHECK-NEXT: %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_negative_initial_offset(float* %A) {
+  %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
+  %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 -12
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 -8
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_negative_initial_offset(float* %A) {
+; CHECK-NEXT: %X.ptr.elt = getelementptr inbounds float, float* %A, i32 -16
+; CHECK-NEXT: %X.ptr = bitcast float* %X.ptr.elt to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+@global_float_array = external global [128 x float], align 4
+define <4 x float> @test_global() {
+  %X = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*), align 4
+  %Y = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 12) to <4 x float>*), align 4
+  %Z = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 16) to <4 x float>*), align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_global() {
+; CHECK-NEXT: %1 = bitcast float* getelementptr inbounds ([128 x float], [128 x float]* @global_float_array, i32 0, i32 8) to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %1, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %1, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <4 x float> @test_stack() {
+; Use huge alignment to test that ADD would not be converted to OR
+  %array = alloca [32 x float], align 128
+  %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
+  call void @external_function(float* %arraydecay)
+  %X.ptr = bitcast [32 x float]* %array to <4 x float>*
+  %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+  %Y.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 4
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 8
+  %Z.ptr = bitcast float* %Z.ptr.elt to <4 x float>*
+  %Z = load <4 x float>, <4 x float>* %Z.ptr, align 4
+  %tmp.sum = fadd <4 x float> %X, %Y
+  %sum = fadd <4 x float> %tmp.sum, %Z
+  ret <4 x float> %sum
+}
+
+; CHECK-LABEL: define <4 x float> @test_stack() {
+; CHECK-NEXT: %array = alloca [32 x float], align 128
+; CHECK-NEXT: %arraydecay = getelementptr inbounds [32 x float], [32 x float]* %array, i32 0, i32 0
+; CHECK-NEXT: call void @external_function(float* %arraydecay)
+; CHECK-NEXT: %X.ptr = bitcast [32 x float]* %array to <4 x float>*
+; CHECK-NEXT: %X = load <4 x float>, <4 x float>* %X.ptr, align 4
+; CHECK-NEXT: %postinc = getelementptr <4 x float>, <4 x float>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %postinc1 = getelementptr <4 x float>, <4 x float>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <4 x float>, <4 x float>* %postinc1, align 4
+; CHECK-NEXT: %tmp.sum = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %sum = fadd <4 x float> %tmp.sum, %Z
+; CHECK-NEXT: ret <4 x float> %sum
+; CHECK-NEXT: }
+
+define <2 x double> @test_double(double* %A) {
+  %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
+  %X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
+  %X = load <2 x double>, <2 x double>* %X.ptr, align 8
+  %Y.ptr.elt = getelementptr inbounds double, double* %A, i32 10
+  %Y.ptr = bitcast double* %Y.ptr.elt to <2 x double>*
+  %Y = load <2 x double>, <2 x double>* %Y.ptr, align 8
+  %Z.ptr.elt = getelementptr inbounds double, double* %A, i32 12
+  %Z.ptr = bitcast double* %Z.ptr.elt to <2 x double>*
+  %Z = load <2 x double>, <2 x double>* %Z.ptr, align 8
+  %tmp.sum = fadd <2 x double> %X, %Y
+  %sum = fadd <2 x double> %tmp.sum, %Z
+  ret <2 x double> %sum
+}
+
+; CHECK-LABEL: define <2 x double> @test_double(double* %A) {
+; CHECK-NEXT: %X.ptr.elt = getelementptr inbounds double, double* %A, i32 8
+; CHECK-NEXT: %X.ptr = bitcast double* %X.ptr.elt to <2 x double>*
+; CHECK-NEXT: %X = load <2 x double>, <2 x double>* %X.ptr, align 8
+; CHECK-NEXT: %postinc = getelementptr <2 x double>, <2 x double>* %X.ptr, i32 1
+; CHECK-NEXT: %Y = load <2 x double>, <2 x double>* %postinc, align 8
+; CHECK-NEXT: %postinc1 = getelementptr <2 x double>, <2 x double>* %postinc, i32 1
+; CHECK-NEXT: %Z = load <2 x double>, <2 x double>* %postinc1, align 8
+; CHECK-NEXT: %tmp.sum = fadd <2 x double> %X, %Y
+; CHECK-NEXT: %sum = fadd <2 x double> %tmp.sum, %Z
+; CHECK-NEXT: ret <2 x double> %sum
+; CHECK-NEXT: }
+
+define void @test_various_instructions(float* %A) {
+  %X.ptr = bitcast float* %A to i8*
+  %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
+  %Y.ptr.elt = getelementptr inbounds float, float* %A, i32 4
+  %Y.ptr = bitcast float* %Y.ptr.elt to <4 x float>*
+  %Y = load <4 x float>, <4 x float>* %Y.ptr, align 4
+  %Z.ptr.elt = getelementptr inbounds float, float* %A, i32 8
+  %Z.ptr = bitcast float* %Z.ptr.elt to i8*
+  %Z = fadd <4 x float> %X, %Y
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %Z.ptr, <4 x float> %Z, i32 4)
+  ret void
+}
+
+; CHECK-LABEL: define void @test_various_instructions(float* %A) {
+; CHECK-NEXT: %X.ptr = bitcast float* %A to i8*
+; CHECK-NEXT: %X = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* %X.ptr, i32 1)
+; CHECK-NEXT: %postinc.byteptr = getelementptr i8, i8* %X.ptr, i32 16
+; CHECK-NEXT: %postinc = bitcast i8* %postinc.byteptr to <4 x float>*
+; CHECK-NEXT: %Y = load <4 x float>, <4 x float>* %postinc, align 4
+; CHECK-NEXT: %Z = fadd <4 x float> %X, %Y
+; CHECK-NEXT: %oldbase.byteptr = bitcast <4 x float>* %postinc to i8*
+; CHECK-NEXT: %postinc.byteptr1 = getelementptr i8, i8* %oldbase.byteptr, i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %postinc.byteptr1, <4 x float> %Z, i32 4)
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
+entry:
+  %cmp61 = icmp sgt i32 %n, 0
+  br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %lsr.iv1 = phi i32 [ 0, %for.body.preheader ], [ %lsr.iv.next2, %for.body ]
+  %lsr.iv = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+  %0 = bitcast float* %a to i8*
+  %1 = bitcast float* %b to i8*
+  %uglygep19 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1920 = bitcast i8* %uglygep19 to <4 x float>*
+  %2 = load <4 x float>, <4 x float>* %uglygep1920, align 4
+  %uglygep16 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1617 = bitcast i8* %uglygep16 to <4 x float>*
+  %scevgep18 = getelementptr <4 x float>, <4 x float>* %uglygep1617, i32 1
+  %3 = load <4 x float>, <4 x float>* %scevgep18, align 4
+  %uglygep13 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1314 = bitcast i8* %uglygep13 to <4 x float>*
+  %scevgep15 = getelementptr <4 x float>, <4 x float>* %uglygep1314, i32 2
+  %4 = load <4 x float>, <4 x float>* %scevgep15, align 4
+  %uglygep10 = getelementptr i8, i8* %0, i32 %lsr.iv1
+  %uglygep1011 = bitcast i8* %uglygep10 to <4 x float>*
+  %scevgep12 = getelementptr <4 x float>, <4 x float>* %uglygep1011, i32 3
+  %5 = load <4 x float>, <4 x float>* %scevgep12, align 4
+  %uglygep8 = getelementptr i8, i8* %1, i32 %lsr.iv1
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* %uglygep8, <4 x float> %2, i32 4)
+  %uglygep6 = getelementptr i8, i8* %1, i32 %lsr.iv1
+  %scevgep7 = getelementptr i8, i8* %uglygep6, i32 16
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep7, <4 x float> %3, i32 4)
+  %uglygep4 = getelementptr i8, i8* %1, i32 %lsr.iv1
+  %scevgep5 = getelementptr i8, i8* %uglygep4, i32 32
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep5, <4 x float> %4, i32 4)
+  %uglygep = getelementptr i8, i8* %1, i32 %lsr.iv1
+  %scevgep = getelementptr i8, i8* %uglygep, i32 48
+  tail call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* nonnull %scevgep, <4 x float> %5, i32 4)
+  %lsr.iv.next = add i32 %lsr.iv, -1
+  %lsr.iv.next2 = add nuw i32 %lsr.iv1, 64
+  %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; CHECK-LABEL: define void @test_lsr_geps(float* %a, float* %b, i32 %n)
+;
+; CHECK: %[[GEP:uglygep[0-9]+]] = getelementptr i8, i8* %0, i32 %lsr.iv1
+; CHECK-NEXT: %[[BASE:uglygep[0-9]+]] = bitcast i8* %[[GEP]] to <4 x float>*
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[BASE]], align 4
+; CHECK-NEXT: %[[POSTINC1:postinc[0-9]+]] = getelementptr <4 x float>, <4 x float>* %[[BASE]], i32 1
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[POSTINC1]], align 4
+; CHECK-NEXT: %[[POSTINC2:postinc[0-9]+]] = getelementptr <4 x float>, <4 x float>* %[[POSTINC1]], i32 1
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[POSTINC2]], align 4
+; CHECK-NEXT: %[[POSTINC3:postinc[0-9]+]] = getelementptr <4 x float>, <4 x float>* %[[POSTINC2]], i32 1
+; CHECK-NEXT: load <4 x float>, <4 x float>* %[[POSTINC3]], align 4
+;
+; CHECK-NEXT: %[[BASE:uglygep[0-9]+]] = getelementptr i8, i8* %1, i32 %lsr.iv1
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[BASE]]
+; CHECK-NEXT: %[[POSTINC1:postinc]] = getelementptr i8, i8* %[[BASE]], i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[POSTINC1]]
+; CHECK-NEXT: %[[POSTINC2:postinc[0-9]+]] = getelementptr i8, i8* %[[POSTINC1]], i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[POSTINC2]]
+; CHECK-NEXT: %[[POSTINC3:postinc[0-9]+]] = getelementptr i8, i8* %[[POSTINC2]], i32 16
+; CHECK-NEXT: tail call void @llvm.arm.neon.vst1.{{.*}} %[[POSTINC3]]
+
+declare void @external_function(float*)
+declare <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8*, i32) nounwind readonly
+declare void @llvm.arm.neon.vst1.p0i8.v4f32(i8*, <4 x float>, i32) nounwind argmemonly
diff --git a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
--- a/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
+++ b/llvm/test/CodeGen/ARM/arm-post-indexing-opt.ll
@@ -7,10 +7,8 @@
 define <4 x float> @test(float* %A) {
 ; CHECK-LABEL: test:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #16
-; CHECK-NEXT: mov r1, #32
-; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -31,10 +29,9 @@
 define <4 x float> @test_stride(float* %A) {
 ; CHECK-LABEL: test_stride:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #24
-; CHECK-NEXT: mov r1, #48
+; CHECK-NEXT: mov r1, #24
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -55,10 +52,9 @@
 define <4 x float> @test_stride_mixed(float* %A) {
 ; CHECK-LABEL: test_stride_mixed:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #24
-; CHECK-NEXT: mov r1, #40
+; CHECK-NEXT: mov r1, #24
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -80,10 +76,10 @@
 define <4 x float> @test_stride_noop(float* %A) {
 ; CHECK-LABEL: test_stride_noop:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #24
-; CHECK-NEXT: mov r1, #56
+; CHECK-NEXT: mov r1, #24
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: mov r1, #32
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0], r1
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
@@ -104,12 +100,10 @@
 define <4 x float> @test_positive_initial_offset(float* %A) {
 ; CHECK-LABEL: test_positive_initial_offset:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r1, r0, #48
-; CHECK-NEXT: vld1.32 {d16, d17}, [r1]
-; CHECK-NEXT: add r1, r0, #32
-; CHECK-NEXT: add r0, r0, #64
-; CHECK-NEXT: vld1.32 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f32 q8, q9, q8
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: bx lr
@@ -130,12 +124,10 @@
 define <4 x float> @test_negative_initial_offset(float* %A) {
 ; CHECK-LABEL: test_negative_initial_offset:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: sub r1, r0, #48
-; CHECK-NEXT: vld1.32 {d16, d17}, [r1]
-; CHECK-NEXT: sub r1, r0, #64
-; CHECK-NEXT: sub r0, r0, #32
-; CHECK-NEXT: vld1.32 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f32 q8, q9, q8
+; CHECK-NEXT: sub r0, r0, #64
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: bx lr
@@ -159,12 +151,10 @@
 ; CHECK: @ %bb.0:
 ; CHECK-NEXT: movw r0, :lower16:global_float_array
 ; CHECK-NEXT: movt r0, :upper16:global_float_array
-; CHECK-NEXT: add r1, r0, #48
-; CHECK-NEXT: vld1.32 {d16, d17}, [r1]
-; CHECK-NEXT: add r1, r0, #32
-; CHECK-NEXT: add r0, r0, #64
-; CHECK-NEXT: vld1.32 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f32 q8, q9, q8
+; CHECK-NEXT: add r0, r0, #32
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vld1.32 {d18, d19}, [r0]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: bx lr
@@ -190,11 +180,10 @@
 ; CHECK-NEXT: mov r4, sp
 ; CHECK-NEXT: mov r0, r4
 ; CHECK-NEXT: bl external_function
-; CHECK-NEXT: orr r0, r4, #32
 ; CHECK-NEXT: vld1.32 {d16, d17}, [r4:128]!
-; CHECK-NEXT: vld1.64 {d18, d19}, [r4:128]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r4:128]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
-; CHECK-NEXT: vld1.64 {d18, d19}, [r0:128]
+; CHECK-NEXT: vld1.32 {d18, d19}, [r4]
 ; CHECK-NEXT: vadd.f32 q0, q8, q9
 ; CHECK-NEXT: sub sp, r11, #8
 ; CHECK-NEXT: pop {r4, r10, r11, pc}
@@ -217,13 +206,11 @@
 define <2 x double> @test_double(double* %A) {
 ; CHECK-LABEL: test_double:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r1, r0, #80
-; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
-; CHECK-NEXT: add r1, r0, #64
-; CHECK-NEXT: add r0, r0, #96
-; CHECK-NEXT: vld1.64 {d18, d19}, [r1]
-; CHECK-NEXT: vadd.f64 d20, d19, d17
-; CHECK-NEXT: vadd.f64 d16, d18, d16
+; CHECK-NEXT: add r0, r0, #64
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.64 {d18, d19}, [r0]!
+; CHECK-NEXT: vadd.f64 d20, d17, d19
+; CHECK-NEXT: vadd.f64 d16, d16, d18
 ; CHECK-NEXT: vld1.64 {d22, d23}, [r0]
 ; CHECK-NEXT: vadd.f64 d1, d20, d23
 ; CHECK-NEXT: vadd.f64 d0, d16, d22
@@ -245,10 +232,8 @@
 define void @test_various_instructions(float* %A) {
 ; CHECK-LABEL: test_various_instructions:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: add r2, r0, #16
-; CHECK-NEXT: mov r1, #32
-; CHECK-NEXT: vld1.32 {d16, d17}, [r0], r1
-; CHECK-NEXT: vld1.32 {d18, d19}, [r2]
+; CHECK-NEXT: vld1.32 {d16, d17}, [r0]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r0]!
 ; CHECK-NEXT: vadd.f32 q8, q8, q9
 ; CHECK-NEXT: vst1.32 {d16, d17}, [r0]
 ; CHECK-NEXT: bx lr
@@ -267,36 +252,27 @@
 define void @test_lsr_geps(float* %a, float* %b, i32 %n) {
 ; CHECK-LABEL: test_lsr_geps:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r11, lr}
-; CHECK-NEXT: push {r4, r5, r11, lr}
 ; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: blt .LBB10_3
-; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: mov r3, #0
-; CHECK-NEXT: mov r12, #48
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB10_1: @ %for.body.preheader
+; CHECK-NEXT: mov r12, #0
 ; CHECK-NEXT: .LBB10_2: @ %for.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: add lr, r0, r3
+; CHECK-NEXT: add r3, r0, r12
 ; CHECK-NEXT: subs r2, r2, #1
-; CHECK-NEXT: mov r4, lr
-; CHECK-NEXT: vld1.32 {d16, d17}, [r4], r12
-; CHECK-NEXT: vld1.32 {d18, d19}, [r4]
-; CHECK-NEXT: add r4, lr, #32
-; CHECK-NEXT: vld1.32 {d20, d21}, [r4]
-; CHECK-NEXT: add r4, lr, #16
-; CHECK-NEXT: vld1.32 {d22, d23}, [r4]
-; CHECK-NEXT: add r4, r1, r3
-; CHECK-NEXT: add r5, r4, #16
-; CHECK-NEXT: add r3, r3, #64
-; CHECK-NEXT: mov lr, r4
-; CHECK-NEXT: add r4, r4, #32
-; CHECK-NEXT: vst1.32 {d16, d17}, [lr], r12
-; CHECK-NEXT: vst1.32 {d22, d23}, [r5]
-; CHECK-NEXT: vst1.32 {d20, d21}, [r4]
-; CHECK-NEXT: vst1.32 {d18, d19}, [lr]
+; CHECK-NEXT: vld1.32 {d16, d17}, [r3]!
+; CHECK-NEXT: vld1.32 {d18, d19}, [r3]!
+; CHECK-NEXT: vld1.32 {d20, d21}, [r3]!
+; CHECK-NEXT: vld1.32 {d22, d23}, [r3]
+; CHECK-NEXT: add r3, r1, r12
+; CHECK-NEXT: add r12, r12, #64
+; CHECK-NEXT: vst1.32 {d16, d17}, [r3]!
+; CHECK-NEXT: vst1.32 {d18, d19}, [r3]!
+; CHECK-NEXT: vst1.32 {d20, d21}, [r3]!
+; CHECK-NEXT: vst1.32 {d22, d23}, [r3]
 ; CHECK-NEXT: bne .LBB10_2
-; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup
-; CHECK-NEXT: pop {r4, r5, r11, pc}
+; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT: bx lr
 entry:
   %cmp61 = icmp sgt i32 %n, 0
   br i1 %cmp61, label %for.body.preheader, label %for.cond.cleanup
diff --git a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
--- a/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
+++ b/llvm/test/CodeGen/ARM/misched-fusion-aes.ll
@@ -76,9 +76,9 @@
 ; CHECK: aese.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QB]]
 
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aese.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aese.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QD]]
@@ -86,6 +86,7 @@
 
 ; CHECK: aese.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QE]]
+; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aese.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QF]]
@@ -93,8 +94,6 @@
 ; CHECK: aese.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QG]]
 
-; CHECK: aese.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
-
 ; CHECK: aese.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesmc.8 {{q[0-9][0-9]?}}, [[QH]]
 }
@@ -170,9 +169,9 @@
 ; CHECK: aesd.8 [[QB:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QB]]
 
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QC:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QC]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aesd.8 [[QD:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QD]]
@@ -180,6 +179,7 @@
 
 ; CHECK: aesd.8 [[QE:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QE]]
+; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 
 ; CHECK: aesd.8 [[QF:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QF]]
@@ -187,7 +187,6 @@
 ; CHECK: aesd.8 [[QG:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QG]]
 
-; CHECK: aesd.8 {{q[0-9][0-9]?}}, {{q[0-9][0-9]?}}
 ; CHECK: aesd.8 [[QH:q[0-9][0-9]?]], {{q[0-9][0-9]?}}
 ; CHECK-NEXT: aesimc.8 {{q[0-9][0-9]?}}, [[QH]]
 }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
--- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -198,21 +198,13 @@
 ; @testNeon is an important example of the nead for ivchains.
 ;
-; Currently we have two extra add.w's that keep the store address
-; live past the next increment because ISEL is unfortunately undoing
-; the store chain. ISEL also fails to convert all but one of the stores to
-; post-increment addressing. However, the loads should use
-; post-increment addressing, no add's or add.w's beyond the three
-; mentioned. Most importantly, there should be no spills or reloads!
+; Loads and stores should use post-increment addressing, no add's or add.w's.
+; Most importantly, there should be no spills or reloads!
 ;
 ; A9: testNeon:
 ; A9: %.lr.ph
-; A9: add.w r
 ; A9-NOT: lsl.w
 ; A9-NOT: {{ldr|str|adds|add r}}
-; A9: vst1.8 {{.*}} [r{{[0-9]+}}], r{{[0-9]+}}
-; A9: add.w r
-; A9-NOT: {{ldr|str|adds|add r}}
 ; A9-NOT: add.w r
 ; A9: bne
 
 define hidden void @testNeon(i8* %ref_data, i32 %ref_stride, i32 %limit, <16 x i8>* nocapture %data) nounwind optsize {
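
Illustrative appendix (not part of the patch): the IR patterns exercised by the new tests correspond to ordinary NEON intrinsics code that performs several 128-bit accesses at constant offsets from one base pointer. A minimal C/C++ sketch, assuming <arm_neon.h> and a NEON (non-MVE) ARM target; the function name sum3 is hypothetical and mirrors @test in arm-post-indexing-opt-ir.ll:

  #include <arm_neon.h>

  // Three consecutive 128-bit loads off one base pointer. After the
  // ARMPostIndexingOpt rewrite, instruction selection can emit
  //   vld1.32 {...}, [r0]!
  // post-indexed loads instead of materializing separate address adds.
  float32x4_t sum3(const float *a) {
    float32x4_t x = vld1q_f32(a);      // offset 0
    float32x4_t y = vld1q_f32(a + 4);  // offset 16 bytes == access size
    float32x4_t z = vld1q_f32(a + 8);  // offset 32 bytes
    return vaddq_f32(vaddq_f32(x, y), z);
  }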