Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -365,9 +365,10 @@
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
           DominatorTree *Dt, AssumptionCache *AC)
       : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
-        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt),
+        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
         Builder(Se->getContext()) {
     CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+    MaxRequiredIntegerTy = nullptr;
   }

   /// \brief Vectorize the tree that starts with the elements in \p VL.
@@ -399,6 +400,7 @@
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
     }
+    MaxRequiredIntegerTy = nullptr;
   }

   /// \returns true if the memory operations A and B are consecutive.
@@ -412,6 +414,15 @@
     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
   }

+  /// \return The vector element size in bits to use when vectorizing the
+  /// expression tree ending at \p V. This method is used by the vectorizer to
+  /// calculate vectorization factors.
+  unsigned getVectorElementSize(Value *V);
+
+  /// Compute the maximum width integer type required to represent the result
+  /// of a scalar expression, if such a type exists.
+  void computeMaxRequiredIntegerTy();
+
 private:

   struct TreeEntry;
@@ -917,8 +928,12 @@
   AliasAnalysis *AA;
   LoopInfo *LI;
   DominatorTree *DT;
+  AssumptionCache *AC;
   /// Instruction builder to construct the vectorized tree.
   IRBuilder<> Builder;
+
+  // The maximum width integer type required to represent a scalar expression.
+  IntegerType *MaxRequiredIntegerTy;
 };

 #ifndef NDEBUG
@@ -1474,6 +1489,15 @@
     ScalarTy = SI->getValueOperand()->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

+  // If we have computed a smaller type for the expression, update VecTy so
+  // that the costs will be accurate.
+  if (MaxRequiredIntegerTy) {
+    auto *IT = dyn_cast<IntegerType>(ScalarTy);
+    assert(IT && "Computed smaller type for non-integer value?");
+    if (MaxRequiredIntegerTy->getBitWidth() < IT->getBitWidth())
+      VecTy = VectorType::get(MaxRequiredIntegerTy, VL.size());
+  }
+
   if (E->NeedToGather) {
     if (allConstant(VL))
       return 0;
@@ -1803,9 +1827,17 @@
     if (EphValues.count(I->User))
       continue;

-    VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
-    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                           I->Lane);
+    // If we plan to rewrite the tree in a smaller type, we will need to sign
+    // extend the extracted value back to the original type. Here, we account
+    // for the extract and the added cost of the sign extend if needed.
+    auto *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
+    if (MaxRequiredIntegerTy) {
+      VecTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
+      ExtractCost += TTI->getCastInstrCost(
+          Instruction::SExt, I->Scalar->getType(), MaxRequiredIntegerTy);
+    }
+    ExtractCost +=
+        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I->Lane);
   }

   Cost += getSpillCost();
@@ -2560,7 +2592,19 @@
   }

   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  vectorizeTree(&VectorizableTree[0]);
+  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
+
+  // If the vectorized tree can be rewritten in a smaller type, we truncate the
+  // vectorized root. InstCombine will then rewrite the entire expression. We
+  // sign extend the extracted values below.
+  if (MaxRequiredIntegerTy) {
+    BasicBlock::iterator I(cast<Instruction>(VectorRoot));
+    Builder.SetInsertPoint(&*++I);
+    auto BundleWidth = VectorizableTree[0].Scalars.size();
+    auto *SmallerTy = VectorType::get(MaxRequiredIntegerTy, BundleWidth);
+    auto *Trunc = Builder.CreateTrunc(VectorRoot, SmallerTy);
+    VectorizableTree[0].VectorizedValue = Trunc;
+  }

   DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
@@ -2593,6 +2637,8 @@
         if (PH->getIncomingValue(i) == Scalar) {
           Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
           Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+          if (MaxRequiredIntegerTy)
+            Ex = Builder.CreateSExt(Ex, Scalar->getType());
           CSEBlocks.insert(PH->getIncomingBlock(i));
           PH->setOperand(i, Ex);
         }
@@ -2600,12 +2646,16 @@
     } else {
       Builder.SetInsertPoint(cast<Instruction>(User));
       Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      if (MaxRequiredIntegerTy)
+        Ex = Builder.CreateSExt(Ex, Scalar->getType());
       CSEBlocks.insert(cast<Instruction>(User)->getParent());
       User->replaceUsesOfWith(Scalar, Ex);
     }
   } else {
     Builder.SetInsertPoint(&F->getEntryBlock().front());
     Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+    if (MaxRequiredIntegerTy)
+      Ex = Builder.CreateSExt(Ex, Scalar->getType());
     CSEBlocks.insert(&F->getEntryBlock());
     User->replaceUsesOfWith(Scalar, Ex);
   }
@@ -3140,10 +3190,133 @@
   BS->ScheduleStart = nullptr;
 }

+unsigned BoUpSLP::getVectorElementSize(Value *V) {
+  auto &DL = F->getParent()->getDataLayout();
+
+  // If V is a store, just return the width of the stored value without
+  // traversing the expression tree. This is the common case.
+  if (auto *Store = dyn_cast<StoreInst>(V))
+    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
+
+  // If V is not a store, we can traverse the expression tree to find loads
+  // that feed it. The type of the loaded value may indicate a more suitable
+  // width than V's type. We want to base the vector element size on the width
+  // of memory operations where possible.
+  SmallVector<Instruction *, 16> Worklist;
+  SmallPtrSet<Instruction *, 16> Visited;
+  if (auto *I = dyn_cast<Instruction>(V))
+    Worklist.push_back(I);
+
+  // Traverse the expression tree in bottom-up order looking for loads. If we
+  // encounter an instruction we don't yet handle, we give up.
+  auto MaxWidth = 0u;
+  auto FoundUnknownInst = false;
+  while (!Worklist.empty() && !FoundUnknownInst) {
+    auto *I = Worklist.pop_back_val();
+    Visited.insert(I);
+
+    // We should only be looking at scalar instructions here. If the current
+    // instruction has a vector type, give up.
+    auto *Ty = I->getType();
+    if (isa<VectorType>(Ty))
+      FoundUnknownInst = true;
+
+    // If the current instruction is a load, update MaxWidth to reflect the
+    // width of the loaded value.
+    else if (isa<LoadInst>(I))
+      MaxWidth = std::max<unsigned>(MaxWidth, DL.getTypeSizeInBits(Ty));
+
+    // Otherwise, we need to visit the operands of the instruction. We only
+    // handle the interesting cases from buildTree here. If an operand is an
+    // instruction we haven't yet visited, we add it to the worklist.
+    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+      for (Use &U : I->operands())
+        if (auto *J = dyn_cast<Instruction>(U.get()))
+          if (!Visited.count(J))
+            Worklist.push_back(J);
+    }
+
+    // If we don't yet handle the instruction, give up.
+    else
+      FoundUnknownInst = true;
+  }
+
+  // If we didn't encounter a memory access in the expression tree, or if we
+  // gave up for some reason, just return the width of V.
+  if (!MaxWidth || FoundUnknownInst)
+    return DL.getTypeSizeInBits(V->getType());
+
+  // Otherwise, return the maximum width we found.
+  return MaxWidth;
+}
+
+void BoUpSLP::computeMaxRequiredIntegerTy() {
+
+  // If there are no external uses, there is nothing to do here.
+  if (ExternalUses.empty())
+    return;
+
+  // We only want to truncate the root of the expression tree to avoid having
+  // to deal with other external uses. The code below ensures that only the
+  // roots are used externally.
+  auto &TreeRoot = VectorizableTree[0].Scalars;
+  SmallPtrSet<Value *, 16> ScalarRoots(TreeRoot.begin(), TreeRoot.end());
+  for (auto &EU : ExternalUses)
+    if (!ScalarRoots.erase(EU.Scalar))
+      return;
+  if (!ScalarRoots.empty())
+    return;
+
+  // The maximum bit width required to represent all the instructions in the
+  // tree without loss of precision. It would be safe to truncate the
+  // expression to this width.
+  auto MaxBitWidth = 0u;
+
+  // Look at each entry in the tree.
+  for (auto &Entry : VectorizableTree) {
+
+    // Get a representative value for the vectorizable bundle. All values in
+    // Entry.Scalars should be isomorphic.
+    auto *Scalar = Entry.Scalars[0];
+
+    // We will rely on InstCombine to rewrite the expression in the narrower
+    // type. However, InstCombine only rewrites single-use values. If the
+    // scalar is used more than once, give up.
+    if (!Scalar->hasOneUse())
+      return;
+
+    // We only compute smaller integer types. If the scalar has a different
+    // type, give up.
+    auto *IT = dyn_cast<IntegerType>(Scalar->getType());
+    if (!IT)
+      return;
+
+    // Compute the maximum bit width required to store the scalar. We use
+    // ValueTracking to compute the number of high-order bits we can truncate.
+    // We then round up to the next power-of-two.
+    auto &DL = F->getParent()->getDataLayout();
+    auto NumSignBits = ComputeNumSignBits(Scalar, DL, 0, AC, 0, DT);
+    auto NumTypeBits = IT->getBitWidth();
+    auto BitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, 8u);
+    if (!isPowerOf2_64(BitWidth))
+      BitWidth = NextPowerOf2(BitWidth);
+    MaxBitWidth = std::max(BitWidth, MaxBitWidth);
+  }
+
+  // If the maximum bit width we compute is less than the width of the roots'
+  // type, we can proceed with the narrowing. Otherwise, do nothing.
+  auto *RootIT = cast<IntegerType>(TreeRoot[0]->getType());
+  if (MaxBitWidth > 0 && MaxBitWidth < RootIT->getBitWidth())
+    MaxRequiredIntegerTy = IntegerType::get(F->getContext(), MaxBitWidth);
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
   typedef MapVector<Value *, StoreList> StoreListMap;
+  typedef SmallVector<WeakVH, 8> LoadList;
+  typedef MapVector<Value *, LoadList> LoadListMap;

   /// Pass identification, replacement for typeid
   static char ID;
@@ -3174,6 +3347,7 @@
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

     StoreRefs.clear();
+    LoadRefs.clear();
     bool Changed = false;

     // If the target claims to have no vector registers don't attempt
@@ -3207,15 +3381,23 @@
     // Scan the blocks in the function in post order.
     for (auto BB : post_order(&F.getEntryBlock())) {
+      collectMemAccesses(BB);
+
+      // Vectorize trees that end at stores.
-      if (unsigned count = collectStores(BB, R)) {
-        (void)count;
-        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+      if (NumStores > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
         Changed |= vectorizeStoreChains(R);
       }

       // Vectorize trees that end at reductions.
       Changed |= vectorizeChainsInBlock(BB, R);
+
+      // Vectorize trees that end at loads. This is intended to catch
+      // gather-like idioms ending at non-consecutive loads.
+      if (NumLoads > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumLoads << " loads.\n");
+        Changed |= vectorizeLoadChains(BB, R);
+      }
     }

     if (Changed) {
@@ -3242,12 +3424,13 @@
   }

 private:
-
-  /// \brief Collect memory references and sort them according to their base
-  /// object. We sort the stores to their base objects to reduce the cost of the
-  /// quadratic search on the stores. TODO: We can further reduce this cost
-  /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+  /// \brief Collect store and load instructions and organize them in StoreRefs
+  /// and LoadRefs according to their base object. We sort the memory accesses
+  /// by their base objects to reduce the cost of consecutive access queries.
+  ///
+  /// TODO: We can further reduce this cost if we flush the chain creation
+  /// every time we run into a memory barrier.
+  void collectMemAccesses(BasicBlock *BB);

   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
@@ -3266,6 +3449,9 @@
   /// \brief Vectorize the stores that were collected in StoreRefs.
   bool vectorizeStoreChains(BoUpSLP &R);

+  /// \brief Vectorize the loads that were collected in LoadRefs.
+  bool vectorizeLoadChains(BasicBlock *BB, BoUpSLP &R);
+
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
@@ -3275,8 +3461,19 @@
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                        BoUpSLP &R);

-private:
+
+  /// The store instructions in a basic block organized by base pointer.
   StoreListMap StoreRefs;
+
+  /// The load instructions in a basic block organized by base pointer.
+  LoadListMap LoadRefs;
+
+  /// The number of stores in a basic block.
+  unsigned NumStores;
+
+  /// The number of loads in a basic block.
+  unsigned NumLoads;
+
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };
@@ -3297,9 +3494,7 @@
   unsigned ChainLen = Chain.size();
   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                << "\n");
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
-  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+  unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = VecRegSize / Sz;

   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3323,6 +3518,7 @@
     ArrayRef<Value *> Operands = Chain.slice(i, VF);

     R.buildTree(Operands);
+    R.computeMaxRequiredIntegerTy();

     int Cost = R.getTreeCost();

@@ -3410,33 +3606,34 @@
   return Changed;
 }
+void SLPVectorizer::collectMemAccesses(BasicBlock *BB) {

-unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
-  unsigned count = 0;
+  // Initialize the collection.
   StoreRefs.clear();
+  LoadRefs.clear();
+  NumStores = NumLoads = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
-  for (Instruction &I : *BB) {
-    StoreInst *SI = dyn_cast<StoreInst>(&I);
-    if (!SI)
-      continue;
-
-    // Don't touch volatile stores.
-    if (!SI->isSimple())
-      continue;
-
-    // Check that the pointer points to scalars.
-    Type *Ty = SI->getValueOperand()->getType();
-    if (!isValidElementType(Ty))
-      continue;
-
-    // Find the base pointer.
-    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
-    // Save the store locations.
-    StoreRefs[Ptr].push_back(SI);
-    count++;
+
+  // Visit the store and load instructions in BB and organize them in StoreRefs
+  // and LoadRefs according to their base pointers. We ignore volatile accesses
+  // and accesses whose pointer operand doesn't point to scalars.
+  for (Instruction &I : *BB) {
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      if (!SI->isSimple())
+        continue;
+      if (!isValidElementType(SI->getValueOperand()->getType()))
+        continue;
+      StoreRefs[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
+      ++NumStores;
+    } else if (auto *LI = dyn_cast<LoadInst>(&I)) {
+      if (!LI->isSimple())
+        continue;
+      if (!isValidElementType(LI->getType()))
+        continue;
+      LoadRefs[GetUnderlyingObject(LI->getPointerOperand(), DL)].push_back(LI);
+      ++NumLoads;
+    }
   }
-  return count;
 }

 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
@@ -3460,12 +3657,10 @@
     return false;

   unsigned Opcode0 = I0->getOpcode();
-  const DataLayout &DL = I0->getModule()->getDataLayout();
-  Type *Ty0 = I0->getType();
-  unsigned Sz = DL.getTypeSizeInBits(Ty0);

   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
+  unsigned Sz = R.getVectorElementSize(I0);
   unsigned VF = MinVecRegSize / Sz;

   for (Value *V : VL) {
@@ -3514,6 +3709,7 @@
       Value *ReorderedOps[] = { Ops[1], Ops[0] };
       R.buildTree(ReorderedOps, None);
     }
+    R.computeMaxRequiredIntegerTy();
    int Cost = R.getTreeCost();

    if (Cost < -SLPCostThreshold) {
@@ -3780,6 +3976,7 @@
    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
+      V.computeMaxRequiredIntegerTy();

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
@@ -4185,6 +4382,90 @@
   return Changed;
 }

+bool SLPVectorizer::vectorizeLoadChains(BasicBlock *BB, BoUpSLP &R) {
+  auto Changed = false;
+  auto &DL = BB->getModule()->getDataLayout();
+  for (auto &Entry : LoadRefs) {
+
+    // If the load list has fewer than two elements, there's nothing to do.
+    if (Entry.second.size() < 2)
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing a load chain of length "
+                 << Entry.second.size() << ".\n");
+
+    // Try to vectorize trees ending at non-consecutive loads. We are currently
+    // only interested in gather-like cases of the form:
+    //
+    //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+    //
+    // where the loads of "a", the loads of "b", and the subtractions can be
+    // performed in parallel. It's likely that detecting this pattern
+    // in a bottom-up phase will be simpler and less costly than building a
+    // full-blown top-down phase beginning at the consecutive loads.
+    SmallVector<Value *, 16> Bundle;
+    SmallVector<LoadInst *, 16> Loads;
+    for (auto VH : Entry.second) {
+
+      // The load may have been vectorized after we initially collected it. If
+      // so, the WeakVH will have nullified the value.
+      auto *Load = dyn_cast_or_null<LoadInst>(VH);
+      if (!Load)
+        break;
+
+      // We only handle loads in which the pointer operand is a getelementptr.
+      // If we encounter anything else, give up.
+      auto *GEP = dyn_cast<GetElementPtrInst>(Load->getPointerOperand());
+      if (!GEP)
+        break;
+
+      // Furthermore, we only handle getelementptr instructions having a single
+      // non-constant index. Constant indices are not interesting, and we limit
+      // ourselves to getelementptrs with a single index to save compile time.
+      // If we encounter anything else, give up.
+      //
+      // TODO: Relax these restrictions if we find an interesting case that we
+      // miss and compile time is not an issue.
+      auto *Idx = GEP->idx_begin()->get();
+      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+        break;
+
+      // Add the getelementptr's index to the bundle and record its pointer
+      // operand. We will ensure the loads are non-consecutive in the final
+      // step since this check is the most expensive.
+      Bundle.push_back(Idx);
+      Loads.push_back(Load);
+    }
+
+    // If the bundle has fewer than two elements, there's nothing to do.
+    if (Bundle.size() < 2)
+      continue;
+
+    // We process the bundle in chunks of 16 (like we do for stores) to
+    // minimize compile time.
+    for (unsigned BI = 0, BE = Bundle.size(); BI < BE; BI += 16) {
+      auto Len = std::min<unsigned>(BE - BI, 16);
+
+      // If the load list accesses contiguous memory, it doesn't make sense to
+      // pursue the bundle further in a bottom-up phase. The load list should
+      // have already been vectorized if profitable. The code below gives up if
+      // a single pair of consecutive load accesses is found.
+      auto IsConsecutive = false;
+      for (unsigned LI = BI; LI < BI + Len && !IsConsecutive; ++LI)
+        for (unsigned LJ = BI; LJ < BI + Len && !IsConsecutive; ++LJ)
+          if (LI != LJ)
+            if (R.isConsecutiveAccess(Loads[LI], Loads[LJ], DL))
+              IsConsecutive = true;
+      if (IsConsecutive)
+        continue;
+
+      // Try to vectorize the chunk.
+      Changed |= tryToVectorizeList(makeArrayRef(&Bundle[BI], Len), R);
+    }
+  }
+  return Changed;
+}
+
 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
Index: test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -0,0 +1,262 @@
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+; +; int gather_reduce_8x16(short *a, short *b, short *g, int n) { +; int sum = 0; +; for (int i = 0; i < n ; ++i) { +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; sum += g[*a++ - *b++]; sum += g[*a++ - *b++]; +; } +; return sum; +; } + +; CHECK-LABEL: @gather_reduce_8x16_i32 +; +; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16> +; CHECK: zext <8 x i16> [[L]] to <8 x i32> +; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32> +; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]] +; CHECK: sext i32 [[X]] to i64 +; +define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) { +entry: + %cmp.99 = icmp sgt i32 %n, 0 + br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ] + ret i32 %sum.0.lcssa + +for.body: + %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ] + %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ] + %b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1 + %0 = load i16, i16* %a.addr.0101, align 2 + %conv = zext i16 %0 to i32 + %incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1 + %1 = load i16, i16* %b.addr.0100, align 2 + %conv2 = zext i16 %1 to i32 + %sub = sub nsw i32 %conv, %conv2 + %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub + %2 = load i16, i16* %arrayidx, align 2 + %conv3 = zext i16 %2 to i32 + %add = add nsw i32 %conv3, %sum.0102 + %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2 + %3 = load i16, i16* %incdec.ptr, align 2 + %conv5 = zext i16 %3 to i32 + %incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2 + %4 = load i16, i16* %incdec.ptr1, align 2 + %conv7 = zext i16 %4 to i32 + %sub8 = sub nsw i32 %conv5, %conv7 + %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8 + %5 = load i16, i16* %arrayidx10, align 2 + %conv11 = zext i16 %5 to i32 + %add12 = add nsw i32 %add, %conv11 + %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3 + %6 = load i16, i16* %incdec.ptr4, align 2 + %conv14 = zext i16 %6 to i32 + %incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3 + %7 = load i16, i16* %incdec.ptr6, align 2 + %conv16 = zext i16 %7 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = zext i16 %8 to i32 + %add21 = add nsw i32 %add12, %conv20 + %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4 + %9 = load i16, i16* %incdec.ptr13, align 2 + %conv23 = zext i16 %9 to i32 + %incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4 + %10 = load i16, i16* %incdec.ptr15, align 2 + %conv25 = zext i16 %10 to i32 + %sub26 = sub nsw i32 %conv23, %conv25 + %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26 + %11 = load i16, i16* %arrayidx28, align 2 + %conv29 = zext i16 %11 to i32 + %add30 = add nsw i32 %add21, %conv29 + %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5 + %12 = load i16, i16* %incdec.ptr22, align 2 + %conv32 = 
zext i16 %12 to i32 + %incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5 + %13 = load i16, i16* %incdec.ptr24, align 2 + %conv34 = zext i16 %13 to i32 + %sub35 = sub nsw i32 %conv32, %conv34 + %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35 + %14 = load i16, i16* %arrayidx37, align 2 + %conv38 = zext i16 %14 to i32 + %add39 = add nsw i32 %add30, %conv38 + %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6 + %15 = load i16, i16* %incdec.ptr31, align 2 + %conv41 = zext i16 %15 to i32 + %incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6 + %16 = load i16, i16* %incdec.ptr33, align 2 + %conv43 = zext i16 %16 to i32 + %sub44 = sub nsw i32 %conv41, %conv43 + %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44 + %17 = load i16, i16* %arrayidx46, align 2 + %conv47 = zext i16 %17 to i32 + %add48 = add nsw i32 %add39, %conv47 + %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7 + %18 = load i16, i16* %incdec.ptr40, align 2 + %conv50 = zext i16 %18 to i32 + %incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7 + %19 = load i16, i16* %incdec.ptr42, align 2 + %conv52 = zext i16 %19 to i32 + %sub53 = sub nsw i32 %conv50, %conv52 + %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53 + %20 = load i16, i16* %arrayidx55, align 2 + %conv56 = zext i16 %20 to i32 + %add57 = add nsw i32 %add48, %conv56 + %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8 + %21 = load i16, i16* %incdec.ptr49, align 2 + %conv59 = zext i16 %21 to i32 + %incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8 + %22 = load i16, i16* %incdec.ptr51, align 2 + %conv61 = zext i16 %22 to i32 + %sub62 = sub nsw i32 %conv59, %conv61 + %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62 + %23 = load i16, i16* %arrayidx64, align 2 + %conv65 = zext i16 %23 to i32 + %add66 = add nsw i32 %add57, %conv65 + %inc = add nuw nsw i32 %i.0103, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +; CHECK-LABEL: @gather_reduce_8x16_i64 +; +; CHECK-NOT: load <8 x i16> +; +; FIXME: We are currently unable to vectorize the case with i64 subtraction +; because the zero extensions are too expensive. The solution here is to +; convert the i64 subtractions to i32 subtractions during vectorization. +; This would then match the case above. 
+; +define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) { +entry: + %cmp.99 = icmp sgt i32 %n, 0 + br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + br label %for.body + +for.cond.cleanup.loopexit: + br label %for.cond.cleanup + +for.cond.cleanup: + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ] + ret i32 %sum.0.lcssa + +for.body: + %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ] + %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ] + %b.addr.0100 = phi i16* [ %incdec.ptr60, %for.body ], [ %b, %for.body.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1 + %0 = load i16, i16* %a.addr.0101, align 2 + %conv = zext i16 %0 to i64 + %incdec.ptr1 = getelementptr inbounds i16, i16* %b.addr.0100, i64 1 + %1 = load i16, i16* %b.addr.0100, align 2 + %conv2 = zext i16 %1 to i64 + %sub = sub nsw i64 %conv, %conv2 + %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub + %2 = load i16, i16* %arrayidx, align 2 + %conv3 = zext i16 %2 to i32 + %add = add nsw i32 %conv3, %sum.0102 + %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2 + %3 = load i16, i16* %incdec.ptr, align 2 + %conv5 = zext i16 %3 to i64 + %incdec.ptr6 = getelementptr inbounds i16, i16* %b.addr.0100, i64 2 + %4 = load i16, i16* %incdec.ptr1, align 2 + %conv7 = zext i16 %4 to i64 + %sub8 = sub nsw i64 %conv5, %conv7 + %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8 + %5 = load i16, i16* %arrayidx10, align 2 + %conv11 = zext i16 %5 to i32 + %add12 = add nsw i32 %add, %conv11 + %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3 + %6 = load i16, i16* %incdec.ptr4, align 2 + %conv14 = zext i16 %6 to i64 + %incdec.ptr15 = getelementptr inbounds i16, i16* %b.addr.0100, i64 3 + %7 = load i16, i16* %incdec.ptr6, align 2 + %conv16 = zext i16 %7 to i64 + %sub17 = sub nsw i64 %conv14, %conv16 + %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17 + %8 = load i16, i16* %arrayidx19, align 2 + %conv20 = zext i16 %8 to i32 + %add21 = add nsw i32 %add12, %conv20 + %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4 + %9 = load i16, i16* %incdec.ptr13, align 2 + %conv23 = zext i16 %9 to i64 + %incdec.ptr24 = getelementptr inbounds i16, i16* %b.addr.0100, i64 4 + %10 = load i16, i16* %incdec.ptr15, align 2 + %conv25 = zext i16 %10 to i64 + %sub26 = sub nsw i64 %conv23, %conv25 + %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26 + %11 = load i16, i16* %arrayidx28, align 2 + %conv29 = zext i16 %11 to i32 + %add30 = add nsw i32 %add21, %conv29 + %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5 + %12 = load i16, i16* %incdec.ptr22, align 2 + %conv32 = zext i16 %12 to i64 + %incdec.ptr33 = getelementptr inbounds i16, i16* %b.addr.0100, i64 5 + %13 = load i16, i16* %incdec.ptr24, align 2 + %conv34 = zext i16 %13 to i64 + %sub35 = sub nsw i64 %conv32, %conv34 + %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35 + %14 = load i16, i16* %arrayidx37, align 2 + %conv38 = zext i16 %14 to i32 + %add39 = add nsw i32 %add30, %conv38 + %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6 + %15 = load i16, i16* %incdec.ptr31, align 2 + %conv41 = zext i16 %15 to i64 + %incdec.ptr42 = getelementptr inbounds i16, i16* %b.addr.0100, i64 6 + %16 = load i16, 
i16* %incdec.ptr33, align 2 + %conv43 = zext i16 %16 to i64 + %sub44 = sub nsw i64 %conv41, %conv43 + %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44 + %17 = load i16, i16* %arrayidx46, align 2 + %conv47 = zext i16 %17 to i32 + %add48 = add nsw i32 %add39, %conv47 + %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7 + %18 = load i16, i16* %incdec.ptr40, align 2 + %conv50 = zext i16 %18 to i64 + %incdec.ptr51 = getelementptr inbounds i16, i16* %b.addr.0100, i64 7 + %19 = load i16, i16* %incdec.ptr42, align 2 + %conv52 = zext i16 %19 to i64 + %sub53 = sub nsw i64 %conv50, %conv52 + %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53 + %20 = load i16, i16* %arrayidx55, align 2 + %conv56 = zext i16 %20 to i32 + %add57 = add nsw i32 %add48, %conv56 + %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8 + %21 = load i16, i16* %incdec.ptr49, align 2 + %conv59 = zext i16 %21 to i64 + %incdec.ptr60 = getelementptr inbounds i16, i16* %b.addr.0100, i64 8 + %22 = load i16, i16* %incdec.ptr51, align 2 + %conv61 = zext i16 %22 to i64 + %sub62 = sub nsw i64 %conv59, %conv61 + %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62 + %23 = load i16, i16* %arrayidx64, align 2 + %conv65 = zext i16 %23 to i32 + %add66 = add nsw i32 %add57, %conv65 + %inc = add nuw nsw i32 %i.0103, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +}
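Editor's note on the narrowing heuristic: the standalone sketch below (plain C++, independent of LLVM; requiredBitWidth, isPowerOf2, and nextPowerOf2 are hypothetical helper names invented for illustration) walks through the per-bundle arithmetic that computeMaxRequiredIntegerTy performs. The width a bundle needs is the scalar type width minus the sign bits ValueTracking can prove, clamped to at least 8 and rounded up to the next power of two; the tree is rewritten only when the maximum such width over all bundles is smaller than the width of the root's type.

// Illustration only: mirrors the width selection in computeMaxRequiredIntegerTy().
// The patch itself uses llvm::ComputeNumSignBits and llvm::NextPowerOf2.
#include <algorithm>
#include <cstdint>
#include <iostream>

// Next power of two strictly greater than A (same shift cascade as
// llvm::NextPowerOf2 in MathExtras.h).
static uint64_t nextPowerOf2(uint64_t A) {
  A |= (A >> 1);
  A |= (A >> 2);
  A |= (A >> 4);
  A |= (A >> 8);
  A |= (A >> 16);
  A |= (A >> 32);
  return A + 1;
}

static bool isPowerOf2(uint64_t V) { return V && !(V & (V - 1)); }

// Bits required for one bundle, assuming NumSignBits <= NumTypeBits: type
// width minus provable sign bits, clamped to at least 8 and rounded up to a
// power of two.
static unsigned requiredBitWidth(unsigned NumTypeBits, unsigned NumSignBits) {
  unsigned BitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, 8u);
  if (!isPowerOf2(BitWidth))
    BitWidth = static_cast<unsigned>(nextPowerOf2(BitWidth));
  return BitWidth;
}

int main() {
  // An i64 bundle with 48 provable sign bits needs 64 - 48 = 16 bits, already
  // a power of two, so a 64-bit root could be narrowed to i16.
  std::cout << requiredBitWidth(64, 48) << "\n"; // prints 16

  // An i32 bundle with 12 provable sign bits needs 20 bits, which rounds up
  // to 32; that is not smaller than a 32-bit root, so nothing is rewritten.
  std::cout << requiredBitWidth(32, 12) << "\n"; // prints 32
  return 0;
}

Once the decision is made, the patch only truncates the vectorized root and sign extends the extracted scalars; InstCombine is expected to propagate the narrower type through the rest of the single-use expression tree, as the comments in vectorizeTree and computeMaxRequiredIntegerTy describe.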