Index: lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- lib/Target/ARM/ARMParallelDSP.cpp
+++ lib/Target/ARM/ARMParallelDSP.cpp
@@ -63,21 +63,16 @@
     Instruction   *Root;
     ValueList     AllValues;
     MemInstList   VecLd;    // List of all load instructions.
-    MemLocList    MemLocs;  // All memory locations read by this tree.
+    MemInstList   Loads;
     bool          ReadOnly = true;

     OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
     virtual ~OpChain() = default;

-    void SetMemoryLocations() {
-      const auto Size = LocationSize::unknown();
+    void PopulateLoads() {
       for (auto *V : AllValues) {
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          if (I->mayWriteToMemory())
-            ReadOnly = false;
-          if (auto *Ld = dyn_cast<LoadInst>(V))
-            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
-        }
+        if (auto *Ld = dyn_cast<LoadInst>(V))
+          Loads.push_back(Ld);
       }
     }

@@ -140,12 +135,11 @@
     std::map<LoadInst*, LoadInst*> LoadPairs;
     std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;

-    bool RecordSequentialLoads(BasicBlock *BB);
+    bool RecordMemoryOps(BasicBlock *BB);
     bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
-                            SmallVectorImpl<LoadInst*> &Loads,
-                            IntegerType *LoadTy);
+    LoadInst* CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+                             IntegerType *LoadTy);
     void CreateParallelMACPairs(Reduction &R);
     Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
                                  SmallVectorImpl<LoadInst*> &VecLd1,
@@ -164,6 +158,12 @@
     ARMParallelDSP() : LoopPass(ID) { }

+    bool doInitialization(Loop *L, LPPassManager &LPM) override {
+      LoadPairs.clear();
+      WideLoads.clear();
+      return true;
+    }
+
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       LoopPass::getAnalysisUsage(AU);
       AU.addRequired<AssumptionCacheTracker>();
@@ -228,7 +228,7 @@

     if (!ST->isLittle()) {
       LLVM_DEBUG(dbgs() << "Only supporting little endian: not running pass "
-                 "ARMParallelDSP\n");
+                        << "ARMParallelDSP\n");
       return false;
     }

@@ -237,7 +237,7 @@
     LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
     LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");

-    if (!RecordSequentialLoads(Header)) {
+    if (!RecordMemoryOps(Header)) {
       LLVM_DEBUG(dbgs() << " - No sequential loads found.\n");
       return false;
     }
@@ -314,11 +314,18 @@
   return true;
 }

-/// Iterate through the block and record base, offset pairs of loads as well as
-/// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
+/// Iterate through the block and record base, offset pairs of loads which can
+/// be widened into a single load.
+bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
+  SmallVector<Instruction*, 8> Writes;
+
+  // Collect loads and instructions that may write to memory. For now we only
+  // record loads which are simple, sign-extended and have a single user.
+  // TODO: Allow zero-extended loads.
   for (auto &I : *BB) {
+    if (I.mayWriteToMemory())
+      Writes.push_back(&I);
     auto *Ld = dyn_cast<LoadInst>(&I);
     if (!Ld || !Ld->isSimple() ||
         !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
@@ -326,13 +333,57 @@
     Loads.push_back(Ld);
   }

-  for (auto *Ld0 : Loads) {
-    for (auto *Ld1 : Loads) {
-      if (Ld0 == Ld1)
+  using InstSet = std::set<Instruction*>;
+  using DepMap = std::map<Instruction*, InstSet>;
+  DepMap RAWDeps;
+
+  // Record any writes that may alias a load.
+  const auto Size = LocationSize::unknown();
+  for (auto Read : Loads) {
+    for (auto Write : Writes) {
+      if (Read == Write)
+        continue;
+
+      MemoryLocation ReadLoc =
+        MemoryLocation(Read->getPointerOperand(), Size);
+
+      if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
+                                         ModRefInfo::ModRef)))
+        continue;
+      if (DT->dominates(Write, Read))
+        RAWDeps[Read].insert(Write);
+    }
+  }
+
+  // Check whether there is a write between the two loads which would
+  // prevent them from being safely merged.
+  auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
+    LoadInst *Dominator = DT->dominates(Base, Offset) ? Base : Offset;
+    LoadInst *Dominated = DT->dominates(Base, Offset) ? Offset : Base;
+
+    if (RAWDeps.count(Dominated)) {
+      InstSet &WritesBefore = RAWDeps[Dominated];
+
+      for (auto Before : WritesBefore) {
+
+        // We can't move the second load backward, past a write, to merge
+        // with the first load.
+        if (DT->dominates(Dominator, Before))
+          return false;
+      }
+    }
+    return true;
+  };
+
+  // Record base, offset load pairs.
+  for (auto *Base : Loads) {
+    for (auto *Offset : Loads) {
+      if (Base == Offset)
         continue;

-      if (AreSequentialAccesses(Ld0, Ld1, *DL, *SE)) {
-        LoadPairs[Ld0] = Ld1;
+      if (AreSequentialAccesses(Base, Offset, *DL, *SE) &&
+          SafeToPair(Base, Offset)) {
+        LoadPairs[Base] = Offset;
         break;
       }
     }
   }
@@ -442,9 +493,9 @@
   for (auto &Pair : Reduction.PMACPairs) {
     BinOpChain *PMul0 = Pair.first;
     BinOpChain *PMul1 = Pair.second;
-    LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
-               dbgs() << "- "; PMul0->Root->dump();
-               dbgs() << "- "; PMul1->Root->dump());
+    LLVM_DEBUG(dbgs() << "Found parallel MACs:\n"
+               << "- " << *PMul0->Root << "\n"
+               << "- " << *PMul1->Root << "\n");

     Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
                           InsertAfter);
@@ -459,54 +510,6 @@
   return false;
 }

-static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
-                            ReductionList &Reductions) {
-  RecurrenceDescriptor RecDesc;
-  const bool HasFnNoNaNAttr =
-    F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-  const BasicBlock *Latch = TheLoop->getLoopLatch();
-
-  for (PHINode &Phi : Header->phis()) {
-    const auto *Ty = Phi.getType();
-    if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
-      continue;
-
-    const bool IsReduction =
-      RecurrenceDescriptor::AddReductionVar(&Phi,
-                                            RecurrenceDescriptor::RK_IntegerAdd,
-                                            TheLoop, HasFnNoNaNAttr, RecDesc);
-    if (!IsReduction)
-      continue;
-
-    Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
-    if (!Acc)
-      continue;
-
-    Reductions.push_back(Reduction(&Phi, Acc));
-  }
-
-  LLVM_DEBUG(
-    dbgs() << "\nAccumulating integer additions (reductions) found:\n";
-    for (auto &R : Reductions) {
-      dbgs() << "- "; R.Phi->dump();
-      dbgs() << "-> "; R.AccIntAdd->dump();
-    }
-  );
-}
-
-static void AddMACCandidate(OpChainList &Candidates,
-                            Instruction *Mul,
-                            Value *MulOp0, Value *MulOp1) {
-  assert(Mul->getOpcode() == Instruction::Mul &&
-         "expected mul instruction");
-  ValueList LHS;
-  ValueList RHS;
-  if (IsNarrowSequence<16>(MulOp0, LHS) &&
-      IsNarrowSequence<16>(MulOp1, RHS)) {
-    Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
-  }
-}
-
 static void MatchParallelMACSequences(Reduction &R,
                                       OpChainList &Candidates) {
   Instruction *Acc = R.AccIntAdd;
@@ -528,8 +531,14 @@
     case Instruction::Mul: {
       Value *MulOp0 = I->getOperand(0);
       Value *MulOp1 = I->getOperand(1);
-      if (isa<Instruction>(MulOp0) && isa<Instruction>(MulOp1))
-        AddMACCandidate(Candidates, I, MulOp0, MulOp1);
+      if (isa<Instruction>(MulOp0) && isa<Instruction>(MulOp1)) {
+        ValueList LHS;
+        ValueList RHS;
+        if (IsNarrowSequence<16>(MulOp0, LHS) &&
+            IsNarrowSequence<16>(MulOp1, RHS)) {
+          Candidates.push_back(make_unique<BinOpChain>(I, LHS, RHS));
+        }
+      }
       return false;
     }
     case Instruction::SExt:
@@ -543,52 +552,6 @@
                     << Candidates.size() << " candidates.\n");
 }

-// Collects all instructions that are not part of the MAC chains, which is the
-// set of instructions that can potentially alias with the MAC operands.
-static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
-                            Instructions &Writes) {
-  for (auto &I : *Header) {
-    if (I.mayReadFromMemory())
-      Reads.push_back(&I);
-    if (I.mayWriteToMemory())
-      Writes.push_back(&I);
-  }
-}
-
-// Check whether statements in the basic block that write to memory alias with
-// the memory locations accessed by the MAC-chains.
-// TODO: we need the read statements when we accept more complicated chains.
-static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, OpChainList &MACCandidates) {
-  LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
-
-    // At the moment, we allow only simple chains that only consist of reads,
-    // accumulate their result with an integer add, and thus that don't write
-    // memory, and simply bail if they do.
-    if (!MAC->ReadOnly)
-      return true;
-
-    // Now for all writes in the basic block, check that they don't alias with
-    // the memory locations accessed by our MAC-chain:
-    for (auto *I : Writes) {
-      LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC->MemLocs) {
-        if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
-                                          ModRefInfo::ModRef))) {
-          LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
-          return true;
-        }
-      }
-    }
-  }
-
-  LLVM_DEBUG(dbgs() << "OK: no aliases found!\n");
-  return false;
-}
-
 static bool CheckMACMemory(OpChainList &Candidates) {
   for (auto &C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
@@ -597,7 +560,7 @@
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
-    C->SetMemoryLocations();
+    C->PopulateLoads();
     ValueList &LHS = static_cast<BinOpChain*>(C.get())->LHS;
     ValueList &RHS = static_cast<BinOpChain*>(C.get())->RHS;
@@ -643,14 +606,36 @@
 // before the loop begins.
 //
 bool ARMParallelDSP::MatchSMLAD(Function &F) {
-  BasicBlock *Header = L->getHeader();
-  LLVM_DEBUG(dbgs() << "= Matching SMLAD =\n";
-             dbgs() << "Header block:\n"; Header->dump();
-             dbgs() << "Loop info:\n\n"; L->dump());
-  bool Changed = false;
+  auto FindReductions = [&](ReductionList &Reductions) {
+    RecurrenceDescriptor RecDesc;
+    const bool HasFnNoNaNAttr =
+      F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+    BasicBlock *Latch = L->getLoopLatch();
+
+    for (PHINode &Phi : Latch->phis()) {
+      const auto *Ty = Phi.getType();
+      if (!Ty->isIntegerTy(32) && !Ty->isIntegerTy(64))
+        continue;
+
+      const bool IsReduction = RecurrenceDescriptor::AddReductionVar(
+        &Phi, RecurrenceDescriptor::RK_IntegerAdd, L, HasFnNoNaNAttr, RecDesc);
+
+      if (!IsReduction)
+        continue;
+
+      Instruction *Acc = dyn_cast<Instruction>(Phi.getIncomingValueForBlock(Latch));
+      if (!Acc)
+        continue;
+
+      Reductions.push_back(Reduction(&Phi, Acc));
+    }
+    return !Reductions.empty();
+  };
+
   ReductionList Reductions;
-  MatchReductions(F, L, Header, Reductions);
+  if (!FindReductions(Reductions))
+    return false;

   for (auto &R : Reductions) {
     OpChainList MACCandidates;
@@ -666,72 +651,79 @@
                dbgs() << "\n";);
   }

-  // Collect all instructions that may read or write memory. Our alias
-  // analysis checks bail out if any of these instructions aliases with an
-  // instruction from the MAC-chain.
-  Instructions Reads, Writes;
-  AliasCandidates(Header, Reads, Writes);
-
+  bool Changed = false;
+  // Check whether statements in the basic block that write to memory alias
+  // with the memory locations accessed by the MAC-chains.
   for (auto &R : Reductions) {
-    if (AreAliased(AA, Reads, Writes, R.MACCandidates))
-      return false;
     CreateParallelMACPairs(R);
     Changed |= InsertParallelMACs(R);
   }

-  LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
   return Changed;
 }

-LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
-                                        SmallVectorImpl<LoadInst*> &Loads,
-                                        IntegerType *LoadTy) {
+LoadInst* ARMParallelDSP::CreateWideLoad(SmallVectorImpl<LoadInst*> &Loads,
+                                         IntegerType *LoadTy) {
   assert(Loads.size() == 2 && "currently only support widening two loads");
-
-  const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
-  Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
+
+  LoadInst *Base = Loads[0];
+  LoadInst *Offset = Loads[1];
+
+  Instruction *BaseSExt = dyn_cast<Instruction>(Base->user_back());
+  Instruction *OffsetSExt = dyn_cast<Instruction>(Offset->user_back());
+
+  assert((Base->hasOneUse() && Offset->hasOneUse() && BaseSExt && OffsetSExt)
+         && "Loads should have a single, extending, user");
+
+  std::function<void(Value*, Value*)> MoveBefore =
+    [&](Value *A, Value *B) -> void {
+      if (!isa<Instruction>(A) || !isa<Instruction>(B))
+        return;
+
+      auto *Source = cast<Instruction>(A);
+      auto *Sink = cast<Instruction>(B);
+
+      if (DT->dominates(Source, Sink) ||
+          Source->getParent() != Sink->getParent() ||
+          isa<PHINode>(Source) || isa<PHINode>(Sink))
+        return;
+
+      Source->moveBefore(Sink);
+      for (auto &U : Source->uses())
+        MoveBefore(Source, U.getUser());
+    };
+
+  // Insert the load at the point of the original dominating load.
+  LoadInst *DomLoad = DT->dominates(Base, Offset) ? Base : Offset;
+  IRBuilder<NoFolder> IRB(DomLoad->getParent(),
+                          ++BasicBlock::iterator(DomLoad));
+
+  // Bitcast the pointer to a wider type and create the wide load, while making
+  // sure to maintain the original alignment as this prevents ldrd from being
+  // generated when it could be illegal due to memory alignment.
+  const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
+  Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
                                     LoadTy->getPointerTo(AddrSpace));
   LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
-                                             Loads[0]->getAlignment());
-  // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
-  Instruction *SExt0 = dyn_cast<Instruction>(Loads[0]->user_back());
-  Instruction *SExt1 = dyn_cast<Instruction>(Loads[1]->user_back());
-
-  assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
-         "Loads should have a single, extending, user");
-
-  std::function<void(Instruction*, Instruction*)> MoveAfter =
-    [&](Instruction* Source, Instruction* Sink) -> void {
-      if (DT->dominates(Source, Sink) ||
-          Source->getParent() != Sink->getParent() ||
-          isa<PHINode>(Source) || isa<PHINode>(Sink))
-        return;
-
-      Sink->moveAfter(Source);
-      for (auto &U : Sink->uses())
-        MoveAfter(Sink, cast<Instruction>(U.getUser()));
-    };
+                                             Base->getAlignment());
+
+  // Make sure everything is in the correct order in the basic block.
+  MoveBefore(Base->getPointerOperand(), VecPtr);
+  MoveBefore(VecPtr, WideLoad);

   // From the wide load, create two values that equal the original two loads.
-  Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
-  SExt0->setOperand(0, Bottom);
-  if (auto *I = dyn_cast<Instruction>(Bottom)) {
-    I->moveAfter(WideLoad);
-    MoveAfter(I, SExt0);
-  }
+  // Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+  // TODO: Support big-endian as well.
+  Value *Bottom = IRB.CreateTrunc(WideLoad, Base->getType());
+  BaseSExt->setOperand(0, Bottom);

-  IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
-  Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+  IntegerType *OffsetTy = cast<IntegerType>(Offset->getType());
+  Value *ShiftVal = ConstantInt::get(LoadTy, OffsetTy->getBitWidth());
   Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
-  if (auto *I = dyn_cast<Instruction>(Top))
-    MoveAfter(WideLoad, I);
+  Value *Trunc = IRB.CreateTrunc(Top, OffsetTy);
+  OffsetSExt->setOperand(0, Trunc);

-  Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
-  SExt1->setOperand(0, Trunc);
-  if (auto *I = dyn_cast<Instruction>(Trunc))
-    MoveAfter(I, SExt1);
-
-  WideLoads.emplace(std::make_pair(Loads[0],
+  WideLoads.emplace(std::make_pair(Base,
                                    make_unique<WidenedLoad>(Loads, WideLoad)));
   return WideLoad;
 }
@@ -748,15 +740,13 @@
                     << "- " << *Acc << "\n"
                     << "- Exchange: " << Exchange << "\n");

-  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
-                              ++BasicBlock::iterator(InsertAfter));
-
   // Replace the reduction chain with an intrinsic call
   IntegerType *Ty = IntegerType::get(M->getContext(), 32);
   LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
-    WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+    WideLoads[VecLd0[0]]->getLoad() : CreateWideLoad(VecLd0, Ty);
   LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
-    WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+    WideLoads[VecLd1[0]]->getLoad() : CreateWideLoad(VecLd1, Ty);
+
   Value* Args[] = { WideLd0, WideLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
@@ -767,6 +757,9 @@
     SMLAD = Acc->getType()->isIntegerTy(32) ?
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) :
       Intrinsic::getDeclaration(M, Intrinsic::arm_smlald);
+
+  IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
+                              ++BasicBlock::iterator(InsertAfter));
   CallInst *Call = Builder.CreateCall(SMLAD, Args);
   NumSMLAD++;
   return Call;
Index: test/CodeGen/ARM/ParallelDSP/aliasing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/ParallelDSP/aliasing.ll
@@ -0,0 +1,506 @@
+; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -verify -S | FileCheck %s
+;
+; Alias check: check that the rewrite isn't triggered when there's a store
+; instruction possibly aliasing any mul load operands; arguments are passed
+; without 'restrict' enabled.
+;
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+define dso_local i32 @no_restrict(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+
+; Store inserted here, aliasing with arrayidx, arrayidx1, arrayidx3
+  store i16 42, i16* %arrayidx, align 2
+
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; Alias check: check that the rewrite isn't triggered when there's a store
+; aliasing one of the mul load operands. Arguments are now annotated with
+; 'noalias'.
+;
+; CHECK-NOT: call i32 @llvm.arm.smlad
+;
+define dso_local i32 @restrict(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+
+; Store inserted here, aliasing only with loads from 'arrayidx'.
+  store i16 42, i16* %arrayidx, align 2
+
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+
+; Here the Mul is the LHS, and the Add the RHS.
+  %add11 = add i32 %mul9, %add10
+
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_dominates_all
+; CHECK: store
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: load
+; CHECK: smlad
+define dso_local i32 @store_dominates_all(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  store i16 42, i16* %arrayidx, align 2
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: loads_dominate
+; CHECK-NOT: store
+; CHECK: load i32
+; CHECK-NOT: store
+; CHECK: load i32
+; CHECK: store
+define dso_local i32 @loads_dominate(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  store i16 42, i16* %arrayidx, align 2
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_legal_1
+; CHECK-NOT: store
+; CHECK: phi i32
+; CHECK: [[IV:%[^ ]+]] = phi i32 [ %add
+; CHECK: [[ARG3_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg3, i32 [[IV]]
+; CHECK: [[ARG3:%[^ ]+]] = bitcast i16* [[ARG3_GEP]] to i32*
+; CHECK: load i32, i32* [[ARG3]]
+; CHECK: [[ARG2_GEP:%[^ ]+]] = getelementptr inbounds i16, i16* %arg2, i32 [[IV]]
+; CHECK: [[ARG2:%[^ ]+]] = bitcast i16* [[ARG2_GEP]] to i32*
+; CHECK: load i32, i32* [[ARG2]]
+; CHECK: store
+define dso_local i32 @store_alias_arg3_legal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  store i16 42, i16* %arrayidx, align 2
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_legal_2
+; CHECK-NOT: store
+; CHECK: [[BITCAST:[^ ]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: load i32, i32* [[BITCAST]]
+; CHECK: store i16 42, i16* %arrayidx
+; CHECK: [[BITCAST3:[^ ]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: load i32, i32* [[BITCAST3]]
+; CHECK: smlad
+define dso_local i32 @store_alias_arg3_legal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  store i16 42, i16* %arrayidx, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_illegal_1
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg3_illegal_1(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  store i16 42, i16* %arrayidx1, align 2
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg3_illegal_2
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg3_illegal_2(i32 %arg, i32* nocapture %arg1, i16* noalias nocapture readonly %arg2, i16* noalias nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  store i16 42, i16* %arrayidx, align 2
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg2_illegal_1
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg2_illegal_1(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  store i16 42, i16* %arrayidx6, align 2
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: store_alias_arg2_illegal_2
+; CHECK-NOT: load i32
+define dso_local i32 @store_alias_arg2_illegal_2(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  ret i32 %mac1.0.lcssa
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  store i16 42, i16* %arrayidx3, align 2
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: one_pair_alias
+; FIXME: This test shows we have a bug with smlad insertion
+define i32 @one_pair_alias(i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add26
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.050 = phi i32 [ 0, %entry ], [ %add27, %for.body ]
+  %a.049 = phi i32 [ 0, %entry ], [ %add26, %for.body ]
+  %add3 = or i32 %i.050, 1
+  %add11 = or i32 %i.050, 2
+  %add19 = or i32 %i.050, 3
+  %arrayidx = getelementptr inbounds i16, i16* %b, i32 %i.050
+  %arrayidx4 = getelementptr inbounds i16, i16* %b, i32 %add3
+  %arrayidx12 = getelementptr inbounds i16, i16* %b, i32 %add11
+  %arrayidx20 = getelementptr inbounds i16, i16* %b, i32 %add19
+  %arrayidx1 = getelementptr inbounds i16, i16* %c, i32 %i.050
+  %arrayidx7 = getelementptr inbounds i16, i16* %c, i32 %add3
+  %arrayidx15 = getelementptr inbounds i16, i16* %c, i32 %add11
+  %arrayidx23 = getelementptr inbounds i16, i16* %c, i32 %add19
+  %tmp = load i16, i16* %arrayidx, align 2
+  %tmp2 = load i16, i16* %arrayidx4, align 2
+  %tmp4 = load i16, i16* %arrayidx12, align 2
+  %tmp6 = load i16, i16* %arrayidx20, align 2
+  %tmp1 = load i16, i16* %arrayidx1, align 2
+  store i16 43, i16 *%arrayidx7
+  %tmp3 = load i16, i16* %arrayidx7, align 2
+  %tmp5 = load i16, i16* %arrayidx15, align 2
+  %tmp7 = load i16, i16* %arrayidx23, align 2
+  %conv = sext i16 %tmp to i32
+  %conv2 = sext i16 %tmp1 to i32
+  %mul = mul nsw i32 %conv2, %conv
+  %add = add nsw i32 %mul, %a.049
+  %conv5 = sext i16 %tmp2 to i32
+  %conv8 = sext i16 %tmp3 to i32
+  %mul9 = mul nsw i32 %conv8, %conv5
+  %add10 = add nsw i32 %add, %mul9
+  %conv13 = sext i16 %tmp4 to i32
+  %conv16 = sext i16 %tmp5 to i32
+  %mul17 = mul nsw i32 %conv16, %conv13
+  %add18 = add nsw i32 %add10, %mul17
+  %conv21 = sext i16 %tmp6 to i32
+  %conv24 = sext i16 %tmp7 to i32
+  %mul25 = mul nsw i32 %conv24, %conv21
+  %add26 = add nsw i32 %add18, %mul25
+  %add27 = add nuw nsw i32 %i.050, 4
+  %cmp = icmp ult i32 %add27, 100
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
Index: test/CodeGen/ARM/ParallelDSP/smlad0.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad0.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad0.ll
@@ -11,11 +11,11 @@
 ;
 ; CHECK-LABEL: @OneReduction
 ; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
-; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
-; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V7]], i32 [[V5]], i32 %mac1{{\.}}026)
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 ;
 ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
Index: test/CodeGen/ARM/ParallelDSP/smlad1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad1.ll
@@ -2,11 +2,11 @@

 ; CHECK-LABEL: @test1
 ; CHECK: %mac1{{\.}}026 = phi i32 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
-; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
+; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
-; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V5]], i32 [[V7]], i32 %mac1{{\.}}026)
+; CHECK: [[V8]] = call i32 @llvm.arm.smlad(i32 [[V7]], i32 [[V5]], i32 %mac1{{\.}}026)

 define dso_local i32 @test1(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
 entry:
Index: test/CodeGen/ARM/ParallelDSP/smlad11.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad11.ll
+++ test/CodeGen/ARM/ParallelDSP/smlad11.ll
@@ -4,15 +4,15 @@
 ; A more complicated chain: 4 mul operations, so we expect 2 smlad calls.
 ;
 ; CHECK: %mac1{{\.}}054 = phi i32 [ [[V17:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
-; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
 ; CHECK: [[V10:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V11:%[0-9]+]] = load i32, i32* [[V10]], align 2
-; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
-; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
-; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
 ; CHECK: [[V15:%[0-9]+]] = bitcast i16* %arrayidx4 to i32*
 ; CHECK: [[V16:%[0-9]+]] = load i32, i32* [[V15]], align 2
+; CHECK: [[V8:%[0-9]+]] = bitcast i16* %arrayidx8 to i32*
+; CHECK: [[V9:%[0-9]+]] = load i32, i32* [[V8]], align 2
+; CHECK: [[V13:%[0-9]+]] = bitcast i16* %arrayidx17 to i32*
+; CHECK: [[V14:%[0-9]+]] = load i32, i32* [[V13]], align 2
+; CHECK: [[V12:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V9]], i32 [[V11]], i32 %mac1{{\.}}054)
 ; CHECK: [[V17:%[0-9]+]] = call i32 @llvm.arm.smlad(i32 [[V14]], i32 [[V16]], i32 [[V12]])
 ;
 ; And we don't want to see a 3rd smlad:
Index: test/CodeGen/ARM/ParallelDSP/smlad6.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad6.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-;
-; Alias check: check that the rewrite isn't triggered when there's a store
-; instruction possibly aliasing any mul load operands; arguments are passed
-; without 'restrict' enabled.
-;
-; CHECK-NOT: call i32 @llvm.arm.smlad
-;
-define dso_local i32 @test(i32 %arg, i32* nocapture %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
-entry:
-  %cmp24 = icmp sgt i32 %arg, 0
-  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %.pre = load i16, i16* %arg3, align 2
-  %.pre27 = load i16, i16* %arg2, align 2
-  br label %for.body
-
-for.cond.cleanup:
-  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
-  ret i32 %mac1.0.lcssa
-
-for.body:
-  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
-  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
-  %0 = load i16, i16* %arrayidx, align 2
-
-; Store inserted here, aliasing with arrayidx, arrayidx1, arrayidx3
-  store i16 42, i16* %arrayidx, align 2
-
-  %add = add nuw nsw i32 %i.025, 1
-  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
-  %1 = load i16, i16* %arrayidx1, align 2
-  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
-  %2 = load i16, i16* %arrayidx3, align 2
-  %conv = sext i16 %2 to i32
-  %conv4 = sext i16 %0 to i32
-  %mul = mul nsw i32 %conv, %conv4
-  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
-  %3 = load i16, i16* %arrayidx6, align 2
-  %conv7 = sext i16 %3 to i32
-  %conv8 = sext i16 %1 to i32
-  %mul9 = mul nsw i32 %conv7, %conv8
-  %add10 = add i32 %mul, %mac1.026
-  %add11 = add i32 %mul9, %add10
-  %exitcond = icmp ne i32 %add, %arg
-  br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}
-
Index: test/CodeGen/ARM/ParallelDSP/smlad7.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlad7.ll
+++ /dev/null
@@ -1,53 +0,0 @@
-; RUN: opt -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s -arm-parallel-dsp -S | FileCheck %s
-;
-; Alias check: check that the rewrite isn't triggered when there's a store
-; aliasing one of the mul load operands. Arguments are now annotated with
-; 'noalias'.
-;
-; CHECK-NOT: call i32 @llvm.arm.smlad
-;
-define dso_local i32 @test(i32 %arg, i32* noalias %arg1, i16* noalias readonly %arg2, i16* noalias readonly %arg3) {
-entry:
-  %cmp24 = icmp sgt i32 %arg, 0
-  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
-  %.pre = load i16, i16* %arg3, align 2
-  %.pre27 = load i16, i16* %arg2, align 2
-  br label %for.body
-
-for.cond.cleanup:
-  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
-  ret i32 %mac1.0.lcssa
-
-for.body:
-  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
-  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
-  %0 = load i16, i16* %arrayidx, align 2
-
-; Store inserted here, aliasing only with loads from 'arrayidx'.
-  store i16 42, i16* %arrayidx, align 2
-
-  %add = add nuw nsw i32 %i.025, 1
-  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
-  %1 = load i16, i16* %arrayidx1, align 2
-  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
-  %2 = load i16, i16* %arrayidx3, align 2
-  %conv = sext i16 %2 to i32
-  %conv4 = sext i16 %0 to i32
-  %mul = mul nsw i32 %conv, %conv4
-  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
-  %3 = load i16, i16* %arrayidx6, align 2
-  %conv7 = sext i16 %3 to i32
-  %conv8 = sext i16 %1 to i32
-  %mul9 = mul nsw i32 %conv7, %conv8
-  %add10 = add i32 %mul, %mac1.026
-
-; Here the Mul is the LHS, and the Add the RHS.
-  %add11 = add i32 %mul9, %add10
-
-  %exitcond = icmp ne i32 %add, %arg
-  br i1 %exitcond, label %for.body, label %for.cond.cleanup
-}
-
Index: test/CodeGen/ARM/ParallelDSP/smladx-1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smladx-1.ll
+++ test/CodeGen/ARM/ParallelDSP/smladx-1.ll
@@ -8,15 +8,15 @@
 ; CHECK-LABEL: smladx
 ; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
 ; CHECK: [[ACC0:%[^ ]+]] = phi i32 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
 ; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN23]], i32 [[IN12]], i32 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN21]], i32 [[IN10]], i32 [[ACC1]])
 ; CHECK-NOT: call i32 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i32 @llvm.arm.smlad
@@ -124,20 +124,21 @@

 ; CHECK: [[IV:%[^ ]+]] = phi i32
 ; CHECK: [[ACC0:%[^ ]+]] = phi i32 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
-; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
+; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
+; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
+; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
+; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
 ; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
-; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
-; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
-; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
-; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[IN2_2]], i32 [[IN1]], i32 [[ACC0]])
 ; CHECK: [[ACC2]] = call i32 @llvm.arm.smladx(i32 [[IN2]], i32 [[IN1_2]], i32 [[ACC1]])

 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
Index: test/CodeGen/ARM/ParallelDSP/smlald0.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlald0.ll
+++ test/CodeGen/ARM/ParallelDSP/smlald0.ll
@@ -11,10 +11,10 @@
 ;
 ; CHECK-LABEL: @OneReduction
 ; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
 ; CHECK-NOT: call i64 @llvm.arm.smlald
 ;
Index: test/CodeGen/ARM/ParallelDSP/smlald1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlald1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlald1.ll
@@ -2,10 +2,10 @@

 ; CHECK-LABEL: @test1
 ; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)

 define dso_local i64 @test1(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
Index: test/CodeGen/ARM/ParallelDSP/smlald2.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlald2.ll
+++ test/CodeGen/ARM/ParallelDSP/smlald2.ll
@@ -10,10 +10,10 @@
 ;
 ; CHECK-LABEL: @OneReduction
 ; CHECK: %mac1{{\.}}026 = phi i64 [ [[V8:%[0-9]+]], %for.body ], [ 0, %for.body.preheader ]
-; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
-; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V6:%[0-9]+]] = bitcast i16* %arrayidx to i32*
 ; CHECK: [[V7:%[0-9]+]] = load i32, i32* [[V6]], align 2
+; CHECK: [[V4:%[0-9]+]] = bitcast i16* %arrayidx3 to i32*
+; CHECK: [[V5:%[0-9]+]] = load i32, i32* [[V4]], align 2
 ; CHECK: [[V8]] = call i64 @llvm.arm.smlald(i32 [[V5]], i32 [[V7]], i64 %mac1{{\.}}026)
 ; CHECK-NOT: call i64 @llvm.arm.smlald
 ;
Index: test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
+++ test/CodeGen/ARM/ParallelDSP/smlaldx-1.ll
@@ -7,15 +7,15 @@
 ; CHECK-LABEL: smlaldx
 ; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
 ; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
 ; CHECK-NOT: call i64 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
@@ -181,19 +181,21 @@

 ; CHECK: [[IV:%[^ ]+]] = phi i32
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
+; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+
 ; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
+; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
+; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
 ; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
 ; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
 ; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
-; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
-; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
-; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
-; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2_2]], i32 [[IN1]], i64 [[ACC0]])
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC1]])

 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
Index: test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
===================================================================
--- test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
+++ test/CodeGen/ARM/ParallelDSP/smlaldx-2.ll
@@ -7,15 +7,15 @@
 ; CHECK-LABEL: smlaldx
 ; CHECK: = phi i32 [ 0, %for.body.preheader.new ],
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
+; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
+; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
+; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
+; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[PIN23:%[^ ]+]] = bitcast i16* %pIn2.3 to i32*
 ; CHECK: [[IN23:%[^ ]+]] = load i32, i32* [[PIN23]], align 2
 ; CHECK: [[PIN12:%[^ ]+]] = bitcast i16* %pIn1.2 to i32*
 ; CHECK: [[IN12:%[^ ]+]] = load i32, i32* [[PIN12]], align 2
 ; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN23]], i32 [[IN12]], i64 [[ACC0]])
-; CHECK: [[PIN21:%[^ ]+]] = bitcast i16* %pIn2.1 to i32*
-; CHECK: [[IN21:%[^ ]+]] = load i32, i32* [[PIN21]], align 2
-; CHECK: [[PIN10:%[^ ]+]] = bitcast i16* %pIn1.0 to i32*
-; CHECK: [[IN10:%[^ ]+]] = load i32, i32* [[PIN10]], align 2
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN21]], i32 [[IN10]], i64 [[ACC1]])
 ; CHECK-NOT: call i64 @llvm.arm.smlad
 ; CHECK-UNSUPPORTED-NOT: call i64 @llvm.arm.smlad
@@ -180,19 +180,22 @@
 ; CHECK: [[PIN1:%[^ ]+]] = phi i16* [ [[PIN1_NEXT:%[^ ]+]], %for.body ], [ [[PIN1Base]], %for.body.preheader.new ]
 ; CHECK: [[IV:%[^ ]+]] = phi i32
 ; CHECK: [[ACC0:%[^ ]+]] = phi i64 [ 0, %for.body.preheader.new ], [ [[ACC2:%[^ ]+]], %for.body ]
-; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
-; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_CAST:%[^ ]+]] = bitcast i16* [[PIN2]] to i32*
 ; CHECK: [[IN2:%[^ ]+]] = load i32, i32* [[PIN2_CAST]], align 2
+
+; CHECK: [[PIN1_2:%[^ ]+]] = getelementptr i16, i16* [[PIN1]], i32 -2
 ; CHECK: [[PIN1_2_CAST:%[^ ]+]] = bitcast i16* [[PIN1_2]] to i32*
 ; CHECK: [[IN1_2:%[^ ]+]] = load i32, i32* [[PIN1_2_CAST]], align 2
-; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
-; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
-; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
+; CHECK: [[PIN2_2:%[^ ]+]] = getelementptr i16, i16* [[PIN2]], i32 -2
 ; CHECK: [[PIN2_2_CAST:%[^ ]+]] = bitcast i16* [[PIN2_2]] to i32*
 ; CHECK: [[IN2_2:%[^ ]+]] = load i32, i32* [[PIN2_2_CAST]], align 2
+
+; CHECK: [[PIN1_CAST:%[^ ]+]] = bitcast i16* [[PIN1]] to i32*
+; CHECK: [[IN1:%[^ ]+]] = load i32, i32* [[PIN1_CAST]], align 2
+
+; CHECK: [[ACC1:%[^ ]+]] = call i64 @llvm.arm.smlaldx(i32 [[IN2]], i32 [[IN1_2]], i64 [[ACC0]])
 ; CHECK: [[ACC2]] = call i64 @llvm.arm.smlaldx(i32 [[IN1]], i32 [[IN2_2]], i64 [[ACC1]])

 ; CHECK: [[PIN1_NEXT]] = getelementptr i16, i16* [[PIN1]], i32 4
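For reference, this is the shape of the rewrite that the positive tests above check for, as a minimal schematic distilled from the CHECK lines rather than copied from any one test; the value names and surrounding loop are illustrative. Each pair of sequential i16 loads is widened into one i32 load that keeps the original alignment, and the sext/mul/add reduction chain collapses into a single intrinsic call:

; Before: two i16 loads per operand array, two sext/mul chains, two adds.
;   %lo.a = load i16, i16* %a.addr, align 2        ; a[i]
;   %hi.a = load i16, i16* %a.addr.1, align 2      ; a[i+1]
;   %lo.b = load i16, i16* %b.addr, align 2        ; b[i]
;   %hi.b = load i16, i16* %b.addr.1, align 2      ; b[i+1]
;   ... sext each to i32, %lo.a*%lo.b and %hi.a*%hi.b, both added into %acc ...
;
; After: one wide load per array and one call per MAC pair.
;   %a.cast = bitcast i16* %a.addr to i32*
;   %a.wide = load i32, i32* %a.cast, align 2      ; original alignment kept
;   %b.cast = bitcast i16* %b.addr to i32*
;   %b.wide = load i32, i32* %b.cast, align 2
;   %acc.next = call i32 @llvm.arm.smlad(i32 %a.wide, i32 %b.wide, i32 %acc)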