Index: llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
+++ llvm/trunk/lib/Target/ARM/ARMParallelDSP.cpp
@@ -53,7 +53,7 @@
   using OpChainList     = SmallVector<std::unique_ptr<OpChain>, 8>;
   using ReductionList   = SmallVector<Reduction, 8>;
   using ValueList       = SmallVector<Value*, 8>;
-  using MemInstList     = SmallVector<Instruction*, 8>;
+  using MemInstList     = SmallVector<LoadInst*, 8>;
   using PMACPair        = std::pair<BinOpChain*, BinOpChain*>;
   using PMACPairList    = SmallVector<PMACPair, 8>;
   using Instructions    = SmallVector<Instruction*, 16>;
@@ -113,6 +113,21 @@
     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
 
+  class WidenedLoad {
+    LoadInst *NewLd = nullptr;
+    SmallVector<LoadInst*, 4> Loads;
+
+  public:
+    WidenedLoad(SmallVectorImpl<LoadInst*> &Lds, LoadInst *Wide)
+      : NewLd(Wide) {
+      for (auto *I : Lds)
+        Loads.push_back(I);
+    }
+    LoadInst *getLoad() {
+      return NewLd;
+    }
+  };
+
   class ARMParallelDSP : public LoopPass {
     ScalarEvolution   *SE;
     AliasAnalysis     *AA;
@@ -123,13 +138,17 @@
     const DataLayout  *DL;
     Module            *M;
     std::map<LoadInst*, LoadInst*> LoadPairs;
-    std::map<LoadInst*, SmallVector<LoadInst*, 4>> SequentialLoads;
+    std::map<LoadInst*, std::unique_ptr<WidenedLoad>> WideLoads;
 
-    bool RecordSequentialLoads(BasicBlock *Header);
+    bool RecordSequentialLoads(BasicBlock *BB);
     bool InsertParallelMACs(Reduction &Reduction);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
+    LoadInst* CreateLoadIns(IRBuilder<NoFolder> &IRB,
+                            SmallVectorImpl<LoadInst*> &Loads,
+                            IntegerType *LoadTy);
     void CreateParallelMACPairs(Reduction &R);
-    Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+    Instruction *CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+                                 SmallVectorImpl<LoadInst*> &VecLd1,
                                  Instruction *Acc, bool Exchange,
                                  Instruction *InsertAfter);
@@ -202,7 +221,6 @@
       }
 
       LoopAccessInfo LAI(L, SE, TLI, AA, DT, LI);
-      bool Changes = false;
 
       LLVM_DEBUG(dbgs() << "\n== Parallel DSP pass ==\n");
       LLVM_DEBUG(dbgs() << " - " << F.getName() << "\n\n");
@@ -212,7 +230,7 @@
         return false;
       }
 
-      Changes = MatchSMLAD(F);
+      bool Changes = MatchSMLAD(F);
       return Changes;
     }
   };
@@ -225,7 +243,6 @@
 // why we check that types are equal to MaxBitWidth, and not <= MaxBitWidth.
 template<unsigned MaxBitWidth>
 static bool IsNarrowSequence(Value *V, ValueList &VL) {
-  LLVM_DEBUG(dbgs() << "Is narrow sequence? "; V->dump());
   ConstantInt *CInt;
 
   if (match(V, m_ConstantInt(CInt))) {
@@ -244,38 +261,25 @@
   } else if (match(V, m_Add(m_Value(LHS), m_Value(RHS)))) {
     // TODO: we need to implement sadd16/sadd8 for this, which enables to
     // also do the rewrite for smlad8.ll, but it is unsupported for now.
-    LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
     return false;
   } else if (match(V, m_ZExtOrSExt(m_Value(Val)))) {
-    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth) {
-      LLVM_DEBUG(dbgs() << "No, wrong SrcTy size: " <<
-        cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() << "\n");
+    if (cast<CastInst>(I)->getSrcTy()->getIntegerBitWidth() != MaxBitWidth)
       return false;
-    }
 
     if (match(Val, m_Load(m_Value()))) {
-      LLVM_DEBUG(dbgs() << "Yes, found narrow Load:\t"; Val->dump());
       VL.push_back(Val);
      VL.push_back(I);
       return true;
     }
   }
 
-  LLVM_DEBUG(dbgs() << "No, unsupported Op:\t"; I->dump());
   return false;
 }
 
 template<typename MemInst>
 static bool AreSequentialAccesses(MemInst *MemOp0, MemInst *MemOp1,
                                   const DataLayout &DL, ScalarEvolution &SE) {
-  if (!MemOp0->isSimple() || !MemOp1->isSimple()) {
-    LLVM_DEBUG(dbgs() << "No, not touching volatile access\n");
-    return false;
-  }
-  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE)) {
-    LLVM_DEBUG(dbgs() << "OK: accesses are consecutive.\n");
+  if (isConsecutiveAccess(MemOp0, MemOp1, DL, SE))
     return true;
-  }
-  LLVM_DEBUG(dbgs() << "No, accesses aren't consecutive.\n");
   return false;
 }
@@ -284,19 +288,14 @@
   if (!Ld0 || !Ld1)
     return false;
 
-  LLVM_DEBUG(dbgs() << "Are consecutive loads:\n";
+  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Loads are sequential and valid:\n";
     dbgs() << "Ld0:"; Ld0->dump();
     dbgs() << "Ld1:"; Ld1->dump();
   );
 
-  if (!Ld0->hasOneUse() || !Ld1->hasOneUse()) {
-    LLVM_DEBUG(dbgs() << "No, load has more than one use.\n");
-    return false;
-  }
-
-  if (!LoadPairs.count(Ld0) || LoadPairs[Ld0] != Ld1)
-    return false;
-
   VecMem.clear();
   VecMem.push_back(Ld0);
   VecMem.push_back(Ld1);
@@ -305,17 +304,16 @@
 
 /// Iterate through the block and record base, offset pairs of loads as well as
 /// maximal sequences of sequential loads.
-bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *Header) {
+bool ARMParallelDSP::RecordSequentialLoads(BasicBlock *BB) {
   SmallVector<LoadInst*, 8> Loads;
-  for (auto &I : *Header) {
+  for (auto &I : *BB) {
     auto *Ld = dyn_cast<LoadInst>(&I);
-    if (!Ld)
+    if (!Ld || !Ld->isSimple() ||
+        !Ld->hasOneUse() || !isa<SExtInst>(Ld->user_back()))
       continue;
     Loads.push_back(Ld);
   }
 
-  std::map<LoadInst*, LoadInst*> BaseLoads;
-
   for (auto *Ld0 : Loads) {
     for (auto *Ld1 : Loads) {
       if (Ld0 == Ld1)
@@ -323,17 +321,18 @@
 
       if (AreSequentialAccesses<LoadInst>(Ld0, Ld1, *DL, *SE)) {
         LoadPairs[Ld0] = Ld1;
-        if (BaseLoads.count(Ld0)) {
-          LoadInst *Base = BaseLoads[Ld0];
-          BaseLoads[Ld1] = Base;
-          SequentialLoads[Base].push_back(Ld1);
-        } else {
-          BaseLoads[Ld1] = Ld0;
-          SequentialLoads[Ld0].push_back(Ld1);
-        }
+        break;
       }
     }
   }
+
+  LLVM_DEBUG(if (!LoadPairs.empty()) {
+    dbgs() << "Consecutive load pairs:\n";
+    for (auto &MapIt : LoadPairs) {
+      LLVM_DEBUG(dbgs() << *MapIt.first << ", "
+                 << *MapIt.second << "\n");
+    }
+  });
   return LoadPairs.size() > 1;
 }
@@ -362,12 +361,11 @@
   if (!Ld0 || !Ld1 || !Ld2 || !Ld3)
     return false;
 
-  LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n"
-             << "\t Ld0: " << *Ld0 << "\n"
-             << "\t Ld1: " << *Ld1 << "\n"
-             << "and operands " << x + 2 << ":\n"
-             << "\t Ld2: " << *Ld2 << "\n"
-             << "\t Ld3: " << *Ld3 << "\n");
+  LLVM_DEBUG(dbgs() << "Loads:\n"
+             << " - " << *Ld0 << "\n"
+             << " - " << *Ld1 << "\n"
+             << " - " << *Ld2 << "\n"
+             << " - " << *Ld3 << "\n");
 
   if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
     if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
@@ -416,11 +414,6 @@
 
   assert(PMul0 != PMul1 && "expected different chains");
 
-  LLVM_DEBUG(dbgs() << "\nCheck parallel muls:\n";
-             dbgs() << "- "; Mul0->dump();
-             dbgs() << "- "; Mul1->dump());
-
-  LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
   if (CanPair(PMul0, PMul1)) {
     Paired.insert(Mul0);
     Paired.insert(Mul1);
@@ -441,9 +434,8 @@
                dbgs() << "- "; PMul0->Root->dump();
                dbgs() << "- "; PMul1->Root->dump());
 
-    auto *VecLd0 = cast<LoadInst>(PMul0->VecLd[0]);
-    auto *VecLd1 = cast<LoadInst>(PMul1->VecLd[0]);
-    Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, PMul1->Exchange, InsertAfter);
+    Acc = CreateSMLADCall(PMul0->VecLd, PMul1->VecLd, Acc, PMul1->Exchange,
+                          InsertAfter);
     InsertAfter = Acc;
   }
@@ -499,14 +491,12 @@
 static void AddMACCandidate(OpChainList &Candidates,
                             Instruction *Mul,
                             Value *MulOp0, Value *MulOp1) {
-  LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
   assert(Mul->getOpcode() == Instruction::Mul &&
          "expected mul instruction");
   ValueList LHS;
   ValueList RHS;
   if (IsNarrowSequence<16>(MulOp0, LHS) &&
       IsNarrowSequence<16>(MulOp1, RHS)) {
-    LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
     Candidates.push_back(make_unique<BinOpChain>(Mul, LHS, RHS));
   }
 }
@@ -514,7 +504,7 @@
 static void MatchParallelMACSequences(Reduction &R,
                                       OpChainList &Candidates) {
   Instruction *Acc = R.AccIntAdd;
-  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc);
+  LLVM_DEBUG(dbgs() << "\n- Analysing:\t" << *Acc << "\n");
 
   // Returns false to signal the search should be stopped.
   std::function<bool(Value*)> Match =
@@ -687,32 +677,81 @@
   return Changed;
 }
 
-static LoadInst *CreateLoadIns(IRBuilder<NoFolder> &IRB, LoadInst &BaseLoad,
-                               Type *LoadTy) {
-  const unsigned AddrSpace = BaseLoad.getPointerAddressSpace();
-
-  Value *VecPtr = IRB.CreateBitCast(BaseLoad.getPointerOperand(),
+LoadInst* ARMParallelDSP::CreateLoadIns(IRBuilder<NoFolder> &IRB,
+                                        SmallVectorImpl<LoadInst*> &Loads,
+                                        IntegerType *LoadTy) {
+  assert(Loads.size() == 2 && "currently only support widening two loads");
+
+  const unsigned AddrSpace = Loads[0]->getPointerAddressSpace();
+  Value *VecPtr = IRB.CreateBitCast(Loads[0]->getPointerOperand(),
                                     LoadTy->getPointerTo(AddrSpace));
-  return IRB.CreateAlignedLoad(LoadTy, VecPtr, BaseLoad.getAlignment());
+  LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
+                                             Loads[0]->getAlignment());
+  // Fix up users, Loads[0] needs trunc while Loads[1] needs a lshr and trunc.
+  Instruction *SExt0 = dyn_cast<Instruction>(Loads[0]->user_back());
+  Instruction *SExt1 = dyn_cast<Instruction>(Loads[1]->user_back());
+
+  assert((Loads[0]->hasOneUse() && Loads[1]->hasOneUse() && SExt0 && SExt1) &&
+         "Loads should have a single, extending, user");
+
+  std::function<void(Instruction*, Instruction*)> MoveAfter =
+    [&](Instruction* Source, Instruction* Sink) -> void {
+    if (DT->dominates(Source, Sink) ||
+        Source->getParent() != Sink->getParent() ||
+        isa<PHINode>(Source) || isa<PHINode>(Sink))
+      return;
+
+    Sink->moveAfter(Source);
+    for (auto &U : Sink->uses())
+      MoveAfter(Sink, cast<Instruction>(U.getUser()));
+  };
+
+  // From the wide load, create two values that equal the original two loads.
+  Value *Bottom = IRB.CreateTrunc(WideLoad, Loads[0]->getType());
+  SExt0->setOperand(0, Bottom);
+  if (auto *I = dyn_cast<Instruction>(Bottom)) {
+    I->moveAfter(WideLoad);
+    MoveAfter(I, SExt0);
+  }
+
+  IntegerType *Ld1Ty = cast<IntegerType>(Loads[1]->getType());
+  Value *ShiftVal = ConstantInt::get(LoadTy, Ld1Ty->getBitWidth());
+  Value *Top = IRB.CreateLShr(WideLoad, ShiftVal);
+  if (auto *I = dyn_cast<Instruction>(Top))
+    MoveAfter(WideLoad, I);
+
+  Value *Trunc = IRB.CreateTrunc(Top, Ld1Ty);
+  SExt1->setOperand(0, Trunc);
+  if (auto *I = dyn_cast<Instruction>(Trunc))
+    MoveAfter(I, SExt1);
+
+  WideLoads.emplace(std::make_pair(Loads[0],
+                                   make_unique<WidenedLoad>(Loads, WideLoad)));
+  return WideLoad;
 }
 
-Instruction *ARMParallelDSP::CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
+Instruction *ARMParallelDSP::CreateSMLADCall(SmallVectorImpl<LoadInst*> &VecLd0,
+                                             SmallVectorImpl<LoadInst*> &VecLd1,
                                              Instruction *Acc, bool Exchange,
                                              Instruction *InsertAfter) {
   LLVM_DEBUG(dbgs() << "Create SMLAD intrinsic using:\n"
-             << "- " << *VecLd0 << "\n"
-             << "- " << *VecLd1 << "\n"
+             << "- " << *VecLd0[0] << "\n"
+             << "- " << *VecLd0[1] << "\n"
+             << "- " << *VecLd1[0] << "\n"
+             << "- " << *VecLd1[1] << "\n"
             << "- " << *Acc << "\n"
-             << "Exchange: " << Exchange << "\n");
+             << "- Exchange: " << Exchange << "\n");
 
   IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                               ++BasicBlock::iterator(InsertAfter));
 
   // Replace the reduction chain with an intrinsic call
-  Type *Ty = IntegerType::get(M->getContext(), 32);
-  LoadInst *NewLd0 = CreateLoadIns(Builder, VecLd0[0], Ty);
-  LoadInst *NewLd1 = CreateLoadIns(Builder, VecLd1[0], Ty);
-  Value* Args[] = { NewLd0, NewLd1, Acc };
+  IntegerType *Ty = IntegerType::get(M->getContext(), 32);
+  LoadInst *WideLd0 = WideLoads.count(VecLd0[0]) ?
+    WideLoads[VecLd0[0]]->getLoad() : CreateLoadIns(Builder, VecLd0, Ty);
+  LoadInst *WideLd1 = WideLoads.count(VecLd1[0]) ?
+    WideLoads[VecLd1[0]]->getLoad() : CreateLoadIns(Builder, VecLd1, Ty);
+  Value* Args[] = { WideLd0, WideLd1, Acc };
   Function *SMLAD = nullptr;
   if (Exchange)
     SMLAD = Acc->getType()->isIntegerTy(32) ?
@@ -740,7 +779,6 @@
     }
 
     const unsigned Pairs = VL0.size();
-    LLVM_DEBUG(dbgs() << "Number of operand pairs: " << Pairs << "\n");
 
     for (unsigned i = 0; i < Pairs; ++i) {
       const Value *V0 = VL0[i];
@@ -748,24 +786,17 @@
       const auto *Inst0 = dyn_cast<Instruction>(V0);
       const auto *Inst1 = dyn_cast<Instruction>(V1);
 
-      LLVM_DEBUG(dbgs() << "Pair " << i << ":\n";
-                 dbgs() << "mul1: "; V0->dump();
-                 dbgs() << "mul2: "; V1->dump());
-
       if (!Inst0 || !Inst1)
         return false;
 
-      if (Inst0->isSameOperationAs(Inst1)) {
-        LLVM_DEBUG(dbgs() << "OK: same operation found!\n");
+      if (Inst0->isSameOperationAs(Inst1))
         continue;
-      }
 
      const APInt *C0, *C1;
       if (!(match(V0, m_APInt(C0)) && match(V1, m_APInt(C1)) && C0 == C1))
         return false;
     }
 
-    LLVM_DEBUG(dbgs() << "OK: found symmetrical operand lists.\n");
     return true;
   };
Index: llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
+++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/multi-use-loads.ll
@@ -0,0 +1,251 @@
+; RUN: llc -O3 -mtriple=arm-arm-eabi -mcpu=cortex-m33 < %s | FileCheck %s
+
+; CHECK-LABEL: add_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxtah [[COUNT:r[0-9]+]], [[COUNT]], [[A]]
+define i32 @add_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %count.next = add i32 %conv4, %count
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_bottom_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: mul [[COUNT:r[0-9]+]], [[SXT]], [[COUNT]]
+define i32 @mul_bottom_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %conv4, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: mul_top_user
+; CHECK: %for.body
+; CHECK: ldr [[A:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:[rl0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: asr.w [[ASR:[rl0-9]+]], [[ASR]], #16
+; CHECK: mul [[COUNT:[rl0-9]+]], [[ASR]], [[COUNT]]
+define i32 @mul_top_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %mul = mul nsw i32 %conv, %conv4
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %conv7, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: and_user
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]],{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]],{{.*}}, #2]!
+; CHECK: smlad [[ACC:r[0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: uxth [[UXT:r[0-9]+]], [[A]]
+; CHECK: mul [[MUL:r[0-9]+]], [[UXT]], [[MUL]]
+define i32 @and_user(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %bottom = and i32 %conv4, 65535
+  %mul = mul nsw i32 %conv, %conv4
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %add11 = add i32 %mul9, %add10
+  %count.next = mul i32 %bottom, %count
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
+
+; CHECK-LABEL: multi_uses
+; CHECK: %for.body
+; CHECK: ldr [[A:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: ldr [[B:r[0-9]+]], [{{.*}}, #2]!
+; CHECK: smlad [[ACC:[rl0-9]+]], [[B]], [[A]], [[ACC]]
+; CHECK: sxth [[SXT:r[0-9]+]], [[A]]
+; CHECK: eor.w [[EOR:r[0-9]+]], [[SXT]], [[SHIFT:r[0-9]+]]
+; CHECK: mul [[MUL:r[0-9]+]], [[EOR]], [[SXT]]
+; CHECK: lsl.w [[SHIFT]], [[MUL]], #16
+define i32 @multi_uses(i32 %arg, i32* nocapture readnone %arg1, i16* nocapture readonly %arg2, i16* nocapture readonly %arg3) {
+entry:
+  %cmp24 = icmp sgt i32 %arg, 0
+  br i1 %cmp24, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  %.pre = load i16, i16* %arg3, align 2
+  %.pre27 = load i16, i16* %arg2, align 2
+  br label %for.body
+
+for.cond.cleanup:
+  %mac1.0.lcssa = phi i32 [ 0, %entry ], [ %add11, %for.body ]
+  %count.final = phi i32 [ 0, %entry ], [ %count.next, %for.body ]
+  %res = add i32 %mac1.0.lcssa, %count.final
+  ret i32 %res
+
+for.body:
+  %mac1.026 = phi i32 [ %add11, %for.body ], [ 0, %for.body.preheader ]
+  %i.025 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %count = phi i32 [ %count.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i16, i16* %arg3, i32 %i.025
+  %0 = load i16, i16* %arrayidx, align 2
+  %add = add nuw nsw i32 %i.025, 1
+  %arrayidx1 = getelementptr inbounds i16, i16* %arg3, i32 %add
+  %arrayidx3 = getelementptr inbounds i16, i16* %arg2, i32 %i.025
+  %arrayidx6 = getelementptr inbounds i16, i16* %arg2, i32 %add
+  %1 = load i16, i16* %arrayidx1, align 2
+  %2 = load i16, i16* %arrayidx3, align 2
+  %conv = sext i16 %2 to i32
+  %conv4 = sext i16 %0 to i32
+  %bottom = and i32 %conv4, 65535
+  %mul = mul nsw i32 %conv, %conv4
+  %3 = load i16, i16* %arrayidx6, align 2
+  %conv7 = sext i16 %3 to i32
+  %conv8 = sext i16 %1 to i32
+  %mul9 = mul nsw i32 %conv7, %conv8
+  %add10 = add i32 %mul, %mac1.026
+  %shl = shl i32 %conv4, 16
+  %add11 = add i32 %mul9, %add10
+  %xor = xor i32 %bottom, %count
+  %count.next = mul i32 %xor, %shl
+  %exitcond = icmp ne i32 %add, %arg
+  br i1 %exitcond, label %for.body, label %for.cond.cleanup
+}
Index: llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll
+++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/smlad0.ll
@@ -210,3 +210,4 @@
   %exitcond = icmp ne i32 %add, %arg
   br i1 %exitcond, label %for.body, label %for.cond.cleanup
 }
+
Index: llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll
+++ llvm/trunk/test/CodeGen/ARM/ParallelDSP/unroll-n-jam-smlad.ll
@@ -0,0 +1,217 @@
+; RUN: llc -O3 -mtriple=thumbv7em %s -o - | FileCheck %s
+; RUN: llc -O3 -mtriple=thumbv8m.main -mattr=+dsp %s -o - | FileCheck %s
+
+; Test that the duplicate loads are removed, which allows parallel dsp to find
+; the parallel operations.
+
+define void @unroll_n_jam_smlad(i32* %res, i16* %A, i16* %B, i32 %N, i32 %idx) {
+entry:
+  %xtraiter306.i = and i32 %N, 3
+  %unroll_iter310.i = sub i32 %N, %xtraiter306.i
+  %arrayidx.us.i117.i = getelementptr inbounds i32, i32* %res, i32 %idx
+  store i32 0, i32* %arrayidx.us.i117.i, align 4
+  %mul.us.i118.i = mul i32 %idx, %N
+  %inc11.us.i.i = or i32 %idx, 1
+  %arrayidx.us.i117.1.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.i
+  store i32 0, i32* %arrayidx.us.i117.1.i, align 4
+  %mul.us.i118.1.i = mul i32 %inc11.us.i.i, %N
+  %inc11.us.i.1.i = or i32 %idx, 2
+  %arrayidx.us.i117.2.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.1.i
+  store i32 0, i32* %arrayidx.us.i117.2.i, align 4
+  %mul.us.i118.2.i = mul i32 %inc11.us.i.1.i, %N
+  %inc11.us.i.2.i = or i32 %idx, 3
+  %arrayidx.us.i117.3.i = getelementptr inbounds i32, i32* %res, i32 %inc11.us.i.2.i
+  store i32 0, i32* %arrayidx.us.i117.3.i, align 4
+  %mul.us.i118.3.i = mul i32 %inc11.us.i.2.i, %N
+  %inc11.us.i.3.i = add i32 %idx, 4
+  br label %for.body
+
+; CHECK: %for.body
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+; CHECK: smlad
+
+for.body:
+  %A3 = phi i32 [ %add9.us.i.3361.i, %for.body ], [ 0, %entry ]
+  %j.026.us.i.i = phi i32 [ %inc.us.i.3362.i, %for.body ], [ 0, %entry ]
+  %A4 = phi i32 [ %add9.us.i.1.3.i, %for.body ], [ 0, %entry ]
+  %A5 = phi i32 [ %add9.us.i.2.3.i, %for.body ], [ 0, %entry ]
+  %A6 = phi i32 [ %add9.us.i.3.3.i, %for.body ], [ 0, %entry ]
+  %niter335.i = phi i32 [ %niter335.nsub.3.i, %for.body ], [ %unroll_iter310.i, %entry ]
+  %add.us.i.i = add i32 %j.026.us.i.i, %mul.us.i118.i
+  %arrayidx4.us.i.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.i
+  %A7 = load i16, i16* %arrayidx4.us.i.i, align 2
+  %conv.us.i.i = sext i16 %A7 to i32
+  %arrayidx5.us.i.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %A8 = load i16, i16* %arrayidx5.us.i.i, align 2
+  %conv6.us.i.i = sext i16 %A8 to i32
+  %mul7.us.i.i = mul nsw i32 %conv6.us.i.i, %conv.us.i.i
+  %add9.us.i.i = add nsw i32 %mul7.us.i.i, %A3
+  %inc.us.i.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.1.i = add i32 %j.026.us.i.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.i
+  %A9 = load i16, i16* %arrayidx4.us.i.1.i, align 2
+  %conv.us.i.1.i = sext i16 %A9 to i32
+  %arrayidx5.us.i.1.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %B0 = load i16, i16* %arrayidx5.us.i.1.i, align 2
+  %conv6.us.i.1.i = sext i16 %B0 to i32
+  %mul7.us.i.1.i = mul nsw i32 %conv6.us.i.1.i, %conv.us.i.1.i
+  %add9.us.i.1.i = add nsw i32 %mul7.us.i.1.i, %A4
+  %inc.us.i.1.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.2.i = add i32 %j.026.us.i.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.i
+  %B1 = load i16, i16* %arrayidx4.us.i.2.i, align 2
+  %conv.us.i.2.i = sext i16 %B1 to i32
+  %arrayidx5.us.i.2.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %B2 = load i16, i16* %arrayidx5.us.i.2.i, align 2
+  %conv6.us.i.2.i = sext i16 %B2 to i32
+  %mul7.us.i.2.i = mul nsw i32 %conv6.us.i.2.i, %conv.us.i.2.i
+  %add9.us.i.2.i = add nsw i32 %mul7.us.i.2.i, %A5
+  %inc.us.i.2.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.3.i = add i32 %j.026.us.i.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.i
+  %B3 = load i16, i16* %arrayidx4.us.i.3.i, align 2
+  %conv.us.i.3.i = sext i16 %B3 to i32
+  %arrayidx5.us.i.3.i = getelementptr inbounds i16, i16* %B, i32 %j.026.us.i.i
+  %B4 = load i16, i16* %arrayidx5.us.i.3.i, align 2
+  %conv6.us.i.3.i = sext i16 %B4 to i32
+  %mul7.us.i.3.i = mul nsw i32 %conv6.us.i.3.i, %conv.us.i.3.i
+  %add9.us.i.3.i = add nsw i32 %mul7.us.i.3.i, %A6
+  %inc.us.i.3.i = or i32 %j.026.us.i.i, 1
+  %add.us.i.1337.i = add i32 %inc.us.i.i, %mul.us.i118.i
+  %arrayidx4.us.i.1338.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1337.i
+  %B5 = load i16, i16* %arrayidx4.us.i.1338.i, align 2
+  %conv.us.i.1339.i = sext i16 %B5 to i32
+  %arrayidx5.us.i.1340.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.i
+  %B6 = load i16, i16* %arrayidx5.us.i.1340.i, align 2
+  %conv6.us.i.1341.i = sext i16 %B6 to i32
+  %mul7.us.i.1342.i = mul nsw i32 %conv6.us.i.1341.i, %conv.us.i.1339.i
+  %add9.us.i.1343.i = add nsw i32 %mul7.us.i.1342.i, %add9.us.i.i
+  %inc.us.i.1344.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.1.1.i = add i32 %inc.us.i.1.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.1.i
+  %B7 = load i16, i16* %arrayidx4.us.i.1.1.i, align 2
+  %conv.us.i.1.1.i = sext i16 %B7 to i32
+  %arrayidx5.us.i.1.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.i
+  %B6.dup = load i16, i16* %arrayidx5.us.i.1.1.i, align 2
+  %conv6.us.i.1.1.i = sext i16 %B6.dup to i32
+  %mul7.us.i.1.1.i = mul nsw i32 %conv6.us.i.1.1.i, %conv.us.i.1.1.i
+  %add9.us.i.1.1.i = add nsw i32 %mul7.us.i.1.1.i, %add9.us.i.1.i
+  %inc.us.i.1.1.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.2.1.i = add i32 %inc.us.i.2.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.1.i
+  %B9 = load i16, i16* %arrayidx4.us.i.2.1.i, align 2
+  %conv.us.i.2.1.i = sext i16 %B9 to i32
+  %arrayidx5.us.i.2.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.i
+  %B6.dup.i = load i16, i16* %arrayidx5.us.i.2.1.i, align 2
+  %conv6.us.i.2.1.i = sext i16 %B6.dup.i to i32
+  %mul7.us.i.2.1.i = mul nsw i32 %conv6.us.i.2.1.i, %conv.us.i.2.1.i
+  %add9.us.i.2.1.i = add nsw i32 %mul7.us.i.2.1.i, %add9.us.i.2.i
+  %inc.us.i.2.1.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.3.1.i = add i32 %inc.us.i.3.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.1.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.1.i
+  %B11 = load i16, i16* %arrayidx4.us.i.3.1.i, align 2
+  %conv.us.i.3.1.i = sext i16 %B11 to i32
+  %arrayidx5.us.i.3.1.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.i
+  %B6.dup.i.i = load i16, i16* %arrayidx5.us.i.3.1.i, align 2
+  %conv6.us.i.3.1.i = sext i16 %B6.dup.i.i to i32
+  %mul7.us.i.3.1.i = mul nsw i32 %conv6.us.i.3.1.i, %conv.us.i.3.1.i
+  %add9.us.i.3.1.i = add nsw i32 %mul7.us.i.3.1.i, %add9.us.i.3.i
+  %inc.us.i.3.1.i = or i32 %j.026.us.i.i, 2
+  %add.us.i.2346.i = add i32 %inc.us.i.1344.i, %mul.us.i118.i
+  %arrayidx4.us.i.2347.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2346.i
+  %B13 = load i16, i16* %arrayidx4.us.i.2347.i, align 2
+  %conv.us.i.2348.i = sext i16 %B13 to i32
+  %arrayidx5.us.i.2349.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1344.i
+  %B14 = load i16, i16* %arrayidx5.us.i.2349.i, align 2
+  %conv6.us.i.2350.i = sext i16 %B14 to i32
+  %mul7.us.i.2351.i = mul nsw i32 %conv6.us.i.2350.i, %conv.us.i.2348.i
+  %add9.us.i.2352.i = add nsw i32 %mul7.us.i.2351.i, %add9.us.i.1343.i
+  %inc.us.i.2353.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.1.2.i = add i32 %inc.us.i.1.1.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.2.i
+  %B15 = load i16, i16* %arrayidx4.us.i.1.2.i, align 2
+  %conv.us.i.1.2.i = sext i16 %B15 to i32
+  %arrayidx5.us.i.1.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.1.i
+  %B14.dup = load i16, i16* %arrayidx5.us.i.1.2.i, align 2
+  %conv6.us.i.1.2.i = sext i16 %B14.dup to i32
+  %mul7.us.i.1.2.i = mul nsw i32 %conv6.us.i.1.2.i, %conv.us.i.1.2.i
+  %add9.us.i.1.2.i = add nsw i32 %mul7.us.i.1.2.i, %add9.us.i.1.1.i
+  %inc.us.i.1.2.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.2.2.i = add i32 %inc.us.i.2.1.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.2.i
+  %B17 = load i16, i16* %arrayidx4.us.i.2.2.i, align 2
+  %conv.us.i.2.2.i = sext i16 %B17 to i32
+  %arrayidx5.us.i.2.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.1.i
+  %B14.dup.i = load i16, i16* %arrayidx5.us.i.2.2.i, align 2
+  %conv6.us.i.2.2.i = sext i16 %B14.dup.i to i32
+  %mul7.us.i.2.2.i = mul nsw i32 %conv6.us.i.2.2.i, %conv.us.i.2.2.i
+  %add9.us.i.2.2.i = add nsw i32 %mul7.us.i.2.2.i, %add9.us.i.2.1.i
+  %inc.us.i.2.2.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.3.2.i = add i32 %inc.us.i.3.1.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.2.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.2.i
+  %B19 = load i16, i16* %arrayidx4.us.i.3.2.i, align 2
+  %conv.us.i.3.2.i = sext i16 %B19 to i32
+  %arrayidx5.us.i.3.2.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.1.i
+  %B14.dup.i.i = load i16, i16* %arrayidx5.us.i.3.2.i, align 2
+  %conv6.us.i.3.2.i = sext i16 %B14.dup.i.i to i32
+  %mul7.us.i.3.2.i = mul nsw i32 %conv6.us.i.3.2.i, %conv.us.i.3.2.i
+  %add9.us.i.3.2.i = add nsw i32 %mul7.us.i.3.2.i, %add9.us.i.3.1.i
+  %inc.us.i.3.2.i = or i32 %j.026.us.i.i, 3
+  %add.us.i.3355.i = add i32 %inc.us.i.2353.i, %mul.us.i118.i
+  %arrayidx4.us.i.3356.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3355.i
+  %B21 = load i16, i16* %arrayidx4.us.i.3356.i, align 2
+  %conv.us.i.3357.i = sext i16 %B21 to i32
+  %arrayidx5.us.i.3358.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2353.i
+  %B22 = load i16, i16* %arrayidx5.us.i.3358.i, align 2
+  %conv6.us.i.3359.i = sext i16 %B22 to i32
+  %mul7.us.i.3360.i = mul nsw i32 %conv6.us.i.3359.i, %conv.us.i.3357.i
+  %add9.us.i.3361.i = add nsw i32 %mul7.us.i.3360.i, %add9.us.i.2352.i
+  %inc.us.i.3362.i = add i32 %j.026.us.i.i, 4
+  %add.us.i.1.3.i = add i32 %inc.us.i.1.2.i, %mul.us.i118.1.i
+  %arrayidx4.us.i.1.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.1.3.i
+  %B23 = load i16, i16* %arrayidx4.us.i.1.3.i, align 2
+  %conv.us.i.1.3.i = sext i16 %B23 to i32
+  %arrayidx5.us.i.1.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.1.2.i
+  %B22.dup = load i16, i16* %arrayidx5.us.i.1.3.i, align 2
+  %conv6.us.i.1.3.i = sext i16 %B22.dup to i32
+  %mul7.us.i.1.3.i = mul nsw i32 %conv6.us.i.1.3.i, %conv.us.i.1.3.i
+  %add9.us.i.1.3.i = add nsw i32 %mul7.us.i.1.3.i, %add9.us.i.1.2.i
+  %add.us.i.2.3.i = add i32 %inc.us.i.2.2.i, %mul.us.i118.2.i
+  %arrayidx4.us.i.2.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.2.3.i
+  %B25 = load i16, i16* %arrayidx4.us.i.2.3.i, align 2
+  %conv.us.i.2.3.i = sext i16 %B25 to i32
+  %arrayidx5.us.i.2.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.2.2.i
+  %B22.dup.i = load i16, i16* %arrayidx5.us.i.2.3.i, align 2
+  %conv6.us.i.2.3.i = sext i16 %B22.dup.i to i32
+  %mul7.us.i.2.3.i = mul nsw i32 %conv6.us.i.2.3.i, %conv.us.i.2.3.i
+  %add9.us.i.2.3.i = add nsw i32 %mul7.us.i.2.3.i, %add9.us.i.2.2.i
+  %add.us.i.3.3.i = add i32 %inc.us.i.3.2.i, %mul.us.i118.3.i
+  %arrayidx4.us.i.3.3.i = getelementptr inbounds i16, i16* %A, i32 %add.us.i.3.3.i
+  %B27 = load i16, i16* %arrayidx4.us.i.3.3.i, align 2
+  %conv.us.i.3.3.i = sext i16 %B27 to i32
+  %arrayidx5.us.i.3.3.i = getelementptr inbounds i16, i16* %B, i32 %inc.us.i.3.2.i
+  %B22.dup.i.i = load i16, i16* %arrayidx5.us.i.3.3.i, align 2
+  %conv6.us.i.3.3.i = sext i16 %B22.dup.i.i to i32
+  %mul7.us.i.3.3.i = mul nsw i32 %conv6.us.i.3.3.i, %conv.us.i.3.3.i
+  %add9.us.i.3.3.i = add nsw i32 %mul7.us.i.3.3.i, %add9.us.i.3.2.i
+  %niter335.nsub.3.i = add i32 %niter335.i, -4
+  %niter335.ncmp.3.i = icmp eq i32 %niter335.nsub.3.i, 0
+  br i1 %niter335.ncmp.3.i, label %exit, label %for.body
+
+exit:
+  %arrayidx.out.i = getelementptr inbounds i32, i32* %res, i32 0
+  store i32 %add9.us.i.3361.i, i32* %arrayidx.out.i, align 4
+  %arrayidx.out.1.i = getelementptr inbounds i32, i32* %res, i32 1
+  store i32 %add9.us.i.1.3.i, i32* %arrayidx.out.1.i, align 4
+  %arrayidx.out.2.i = getelementptr inbounds i32, i32* %res, i32 2
+  store i32 %add9.us.i.2.3.i, i32* %arrayidx.out.2.i, align 4
+  %arrayidx.out.3.i = getelementptr inbounds i32, i32* %res, i32 3
+  store i32 %add9.us.i.3.3.i, i32* %arrayidx.out.3.i, align 4
+  ret void
+}
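
To make the CreateLoadIns changes in the .cpp diff easier to follow: the pass
replaces a pair of adjacent, sign-extended i16 loads with a single i32 load,
feeds the wide values straight into the smlad intrinsic, and rebuilds the
original narrow values for any remaining users, the bottom half with a trunc
and the top half with a lshr plus trunc (the MoveAfter helper then re-orders
the new instructions so they dominate their users). A minimal hand-written
sketch of the resulting IR shape; @widen_sketch and its value names are
invented for illustration and do not appear in the patch or its tests:

; Sketch only: two consecutive i16 loads from %a (and likewise %b) have
; been replaced by one i32 load each.
define i32 @widen_sketch(i16* %a, i16* %b, i32 %acc) {
entry:
  %a.cast = bitcast i16* %a to i32*
  %b.cast = bitcast i16* %b to i32*
  %wide.a = load i32, i32* %a.cast, align 2
  %wide.b = load i32, i32* %b.cast, align 2
  ; Recreate the bottom narrow value with a trunc ...
  %a0 = trunc i32 %wide.a to i16
  ; ... and the top one with a lshr followed by a trunc. (%a1 is unused
  ; here and shown only to illustrate the top-half pattern.)
  %a1.shifted = lshr i32 %wide.a, 16
  %a1 = trunc i32 %a1.shifted to i16
  ; The wide loads feed the DSP intrinsic directly, while other users
  ; (like the sxtah checked for in add_user) consume the recreated value.
  %mac = call i32 @llvm.arm.smlad(i32 %wide.a, i32 %wide.b, i32 %acc)
  %a0.ext = sext i16 %a0 to i32
  %res = add i32 %mac, %a0.ext
  ret i32 %res
}

declare i32 @llvm.arm.smlad(i32, i32, i32)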
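
The unroll-n-jam test, in turn, depends on earlier cleanup in the -O3 pipeline
recognising its duplicate loads as redundant and removing them: for instance,
%inc.us.i.i and %inc.us.i.1.i above both compute %j.026.us.i.i | 1, so %B6 and
%B6.dup read the same address, and once one of them is eliminated the pass is
left with plain pairs of sequential i16 loads to widen. A reduced sketch of
that duplicate-load shape, again with invented names:

define i32 @duplicate_load_sketch(i16* %b, i32 %j) {
entry:
  ; %idx0 and %idx1 compute the same value, so %ld1 duplicates %ld0 and is
  ; expected to be removed as redundant before the smlad match can succeed.
  %idx0 = or i32 %j, 1
  %addr0 = getelementptr inbounds i16, i16* %b, i32 %idx0
  %ld0 = load i16, i16* %addr0, align 2
  %idx1 = or i32 %j, 1
  %addr1 = getelementptr inbounds i16, i16* %b, i32 %idx1
  %ld1 = load i16, i16* %addr1, align 2
  %s0 = sext i16 %ld0 to i32
  %s1 = sext i16 %ld1 to i32
  %mul = mul nsw i32 %s0, %s1
  ret i32 %mul
}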