Index: lib/Transforms/IPO/PassManagerBuilder.cpp =================================================================== --- lib/Transforms/IPO/PassManagerBuilder.cpp +++ lib/Transforms/IPO/PassManagerBuilder.cpp @@ -285,8 +285,10 @@ if (RerollLoops) MPM.add(createLoopRerollPass()); if (!RunSLPAfterLoopVectorization) { - if (SLPVectorize) + if (SLPVectorize) { MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + MPM.add(createAggressiveDCEPass()); // Delete dead instructions + } if (BBVectorize) { MPM.add(createBBVectorizePass()); @@ -355,6 +357,7 @@ if (RunSLPAfterLoopVectorization) { if (SLPVectorize) { MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + MPM.add(createAggressiveDCEPass()); // Delete dead instructions if (OptLevel > 1 && ExtraVectorizerPasses) { MPM.add(createEarlyCSEPass()); } @@ -496,8 +499,10 @@ // More scalar chains could be vectorized due to more alias information if (RunSLPAfterLoopVectorization) - if (SLPVectorize) + if (SLPVectorize) { PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + PM.add(createAggressiveDCEPass()); // Delete dead instructions + } // After vectorization, assume intrinsics may tell us more about pointer // alignments. Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -61,6 +61,15 @@ "number ")); static cl::opt +SLPGather("slp-vectorize-gather", cl::ZeroOrMore, + cl::init(false), cl::Hidden, + cl::desc("Attempt to vectorize insert vector sequence")); +static cl::opt +SLPScatter("slp-vectorize-scatter", cl::ZeroOrMore, + cl::init(false), cl::Hidden, + cl::desc("Attempt to vectorize extract vector sequence")); + +static cl::opt ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions")); @@ -393,6 +402,7 @@ return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder; } + private: struct TreeEntry; @@ -3050,6 +3060,10 @@ struct SLPVectorizer : public FunctionPass { typedef SmallVector StoreList; typedef MapVector StoreListMap; + typedef SmallVector InsertElementList; + typedef MapVector InsertElementListMap; + typedef SmallVector ExtractElementList; + typedef MapVector ExtractElementListMap; /// Pass identification, replacement for typeid static char ID; @@ -3102,6 +3116,22 @@ // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { + // Combine Insert Element Instructions + if (SLPGather) + if (unsigned count = collectInsertElements(BB)) { + (void)count; + DEBUG(dbgs() << "SLP: Found " << count << " insertelement to combine.\n"); + Changed |= combineInsertElementChains(R); + } + + // Combine Extract Element Instructions + if (SLPScatter) + if (unsigned count = collectExtractElements(BB)) { + (void)count; + DEBUG(dbgs() << "SLP: Found " << count << " extractelement to combine.\n"); + Changed |= combineExtractElementChains(R); + } + // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; @@ -3168,8 +3198,25 @@ bool vectorizeStores(ArrayRef Stores, int costThreshold, BoUpSLP &R); + + /// \brief Collect vector insert element instructions that use extended values + /// from a load instruction inserting them into constant vector elements. + unsigned collectInsertElements(BasicBlock *BB); + + /// \brief Combine the vector insert elements collected in InsertElems. + bool combineInsertElementChains(BoUpSLP &R); + + bool combineInsertElementChain(ArrayRef Chain, BoUpSLP &R); + bool combineInsertElements(ArrayRef InsertElements, BoUpSLP &R); + bool combineExtractElementChain(ArrayRef Chain, BoUpSLP &R, Value *&NewExt); + bool combineExtractElements(ArrayRef ExtractElements, BoUpSLP &R, Value *&NewExt); + unsigned collectExtractElements(BasicBlock *BB); + bool combineExtractElementChains(BoUpSLP &R); + private: StoreListMap StoreRefs; + InsertElementListMap InsertElems; + ExtractElementListMap ExtractElems; }; /// \brief Check that the Values in the slice in VL array are still existent in @@ -4004,6 +4051,387 @@ return Changed; } +bool SLPVectorizer::combineInsertElementChain(ArrayRef Chain, + BoUpSLP &R) { + unsigned Len = Chain.size(); + + Instruction *IE = Chain[0]; + Instruction *Ext = dyn_cast(IE->getOperand(1)); + + assert(Ext && " There should be a ext instruction!"); + + // Make the end of chain the insert point. + Instruction *User = dyn_cast(IE); + IRBuilder<> Builder(User); + + // 1 -Create a new undef vector of narrow type but same width (#elements). + Type *EType = Ext->getOperand(0)->getType(); + VectorType *VType = cast(IE->getOperand(0)->getType()); + unsigned Num = VType->getNumElements(); + Value *Vec = UndefValue::get(VectorType::get(EType, Num)); + for (unsigned I = 0; I < Len; ++I) { + // 2 -Replace original vector with new narrow vector. + // 3- replace extended value with the loaded value + // (might need to create a new insertelement instruction + // instead of replacing operands). + IE = Chain[I]; + User = dyn_cast(IE); + Ext = dyn_cast(IE->getOperand(1)); + Builder.SetInsertPoint(User); + Builder.SetCurrentDebugLocation(IE->getDebugLoc()); + assert(Ext && " There should be a ext instruction!"); + Vec = Builder.CreateInsertElement(Vec, Ext->getOperand(0), + IE->getOperand(2)); + } + + // 4- Create vector extend instruction and insert it after the last + // insertelement instruction. + if (Ext->getOpcode() == Instruction::ZExt) + Vec = Builder.CreateZExt(Vec, IE->getOperand(0)->getType()); + else + Vec = Builder.CreateSExt(Vec, IE->getOperand(0)->getType()); + + // 5 - Replace all of use of last IE with Vec. + IE->replaceAllUsesWith(Vec); + + return true; +} + +bool SLPVectorizer::combineExtractElementChain(ArrayRef Chain, + BoUpSLP &R, Value *&NewExt) { + unsigned Len = Chain.size(); + + Instruction *EE = Chain[0]; + Value *V = cast(EE); + Instruction *Ext = dyn_cast(V->user_back()); + + assert(Ext && " There should be a ext instruction!"); + + // Make the last of chain the insert point + IRBuilder<> Builder(EE); + Builder.SetInsertPoint(EE); + Builder.SetCurrentDebugLocation(EE->getDebugLoc()); + + // 1 -Create a new zero/sign extend instruction if not yet + if (NewExt == nullptr) { + Type *EType = Ext->getType(); + VectorType *VType = cast(EE->getOperand(0)->getType()); + unsigned Num = VType->getNumElements(); + VectorType *NType = VectorType::get(EType, Num); + + if (Ext->getOpcode() == Instruction::ZExt) + NewExt = Builder.CreateZExt(EE->getOperand(0), NType); + else + NewExt = Builder.CreateSExt(EE->getOperand(0), NType); + } + + for (unsigned I = 0; I < Len; ++I) { + // 2 - Using the index value, create new extractelement instruction + // from the extended vector created in (1). Keep the same ordering... + // 3 - Replace uses of extracted value. + EE = Chain[I]; + V = cast(EE); + Ext = dyn_cast(V->user_back()); + Builder.SetInsertPoint(Ext); + Builder.SetCurrentDebugLocation(EE->getDebugLoc()); + assert(Ext && " There should be a ext instruction!"); + Value *NVal = Builder.CreateExtractElement(NewExt, EE->getOperand(1)); + Ext->replaceAllUsesWith(NVal); + } + + return true; +} + +bool SLPVectorizer::combineInsertElements( + ArrayRef InsertElementsCandidates, BoUpSLP &R) { + bool Changed = false; + + // TODO: There can be multiple interleaving chains + // embeeded in the candidates. + // We need an extra step to break them up. + + // - All indexes must be accessed + unsigned NumElements = cast( + InsertElementsCandidates[0]->getOperand(0)->getType())->getNumElements(); + + if (InsertElementsCandidates.size() != NumElements) + return false; + + // Checks for specific properties... + // - Indexes: constants. + // - Inserted value: comes from an extended value. + // Either sign or zero extend operation and have one use. + // All extension operations must match types. + // - Extended value: comes from a load. + bool IsSigned = false; + Type *ExtTy = nullptr; + Type *ExtSrcTy = nullptr; + for (unsigned I = 0, E = InsertElementsCandidates.size(); I < E; ++I) { + InsertElementInst *IE = InsertElementsCandidates[I]; + + if (!isa(IE->getOperand(2))) + return false; + + // TODO: use SCEV? + if (!IE->getOperand(1)->hasOneUse()) + return false; + if (!isa(IE->getOperand(1)) && !isa(IE->getOperand(1))) + return false; + Instruction *Ext = cast(IE->getOperand(1)); + if (I == 0) { + IsSigned = isa(Ext) ? true : false; + ExtTy = Ext->getType(); + ExtSrcTy = Ext->getOperand(0)->getType(); + } + else { + if (IsSigned && isa(Ext)) + return false; + if (Ext->getType() != ExtTy) + return false; + if (Ext->getOperand(0)->getType() != ExtSrcTy) + return false; + } + + //TODO: check more load properties? + if (!isa(Ext->getOperand(0))) + return false; + } + + std::map IndexOccurrence; + for (unsigned I = 0; I < InsertElementsCandidates.size(); ++I) { + InsertElementInst *IE = InsertElementsCandidates[I]; + ConstantInt *Idx = cast(IE->getOperand(2)); + ++IndexOccurrence[Idx->getZExtValue()]; + } + for (unsigned I = 0; I < NumElements; ++I) + if (IndexOccurrence[I] != 1) { + DEBUG(dbgs() << "SLP: Properties check index" << I << "failed for insertelement chain \n"); + return false; + } + + DEBUG(dbgs() << "SLP: Properties check successful for insertelement chain\n"); + Changed = combineInsertElementChain(InsertElementsCandidates, R); + + return Changed; +} + +bool SLPVectorizer::combineExtractElements( + ArrayRef ExtractElementsCandidates, BoUpSLP &R, Value *&NewExt) { + bool Changed = false; + + // - All indexes must be accessed if vector is not undef. + // - TODO: This requirement is not needed for extract/ext. + ExtractElementInst *EE = ExtractElementsCandidates[0]; + VectorType *VType = cast(EE->getOperand(0)->getType()); + unsigned NumElements = VType->getNumElements(); + + if (ExtractElementsCandidates.size() != NumElements) + return false; + + // Checks for specific properties... + + // - Indexes: constants. + // - Inserted value: used by either sign or zero extend instruction and + // has one use. + // - All extenstion operations must match types. + // - Extended value: used by a store instruction. + bool IsSigned = false; + Type *ExtTy = nullptr; + Type *ExtSrcTy = nullptr; + for (unsigned I = 0, E = ExtractElementsCandidates.size(); I < E; ++I) { + ExtractElementInst *EE = ExtractElementsCandidates[I]; + + if (!isa(EE->getOperand(1))) + return false; + + // TODO: use SCEV? + Value *V = cast(EE); + if (!V->hasOneUse()) + return false; + + Instruction *Ext = dyn_cast(V->user_back()); + if (!Ext || (!isa(Ext) && !isa(Ext))) + return false; + if (I == 0) { + IsSigned = isa(Ext) ? true : false; + ExtTy = Ext->getType(); + ExtSrcTy = Ext->getOperand(0)->getType(); + } + else { + if (IsSigned && isa(Ext)) + return false; + if (Ext->getType() != ExtTy) + return false; + if (Ext->getOperand(0)->getType() != ExtSrcTy) + return false; + } + + V = cast(Ext); + if (!V->hasOneUse()) + return false; + + Instruction *St = dyn_cast(V->user_back()); + + if (!St || !isa(St)) + return false; + } + + // - All indexes must be accessed if vector is not undef. + // - TODO: This requirement is not needed for extract/ext. + std::map IndexOccurrence; + for (unsigned I = 0; I < ExtractElementsCandidates.size(); ++I) { + ExtractElementInst *EE = ExtractElementsCandidates[I]; + ConstantInt *Idx = cast(EE->getOperand(1)); + ++IndexOccurrence[Idx->getZExtValue()]; + } + for (unsigned I = 0; I < NumElements; ++I) + if (IndexOccurrence[I] != 1) { + DEBUG(dbgs() << "SLP: Properties check index" << I << "failed for insertelement chain \n"); + return false; + } + + DEBUG(dbgs() << "SLP: Properties check successful for insertelement chain\n"); + Changed = combineExtractElementChain(ExtractElementsCandidates, R, NewExt); + + return Changed; +} + +bool SLPVectorizer::combineInsertElementChains(BoUpSLP &R) { + bool Changed = false; + + for (auto &I : InsertElems) { + DEBUG(dbgs() << "SLP: Analyzing an insertelement chain of length " + << (&I)->second.size() << ".\n"); + + // Process each insertelements candidate chain. + Changed |= combineInsertElements(makeArrayRef(&((&I)->second[0]), + (&I)->second.size()), R); + } + return Changed; +} + +bool SLPVectorizer::combineExtractElementChains(BoUpSLP &R) { + bool Changed = false; + for (auto &I : ExtractElems) { + DEBUG(dbgs() << "SLP: Analyzing an extractelement chain of length " + << (&I)->second.size() << ".\n"); + + ExtractElementInst *EE = (&I)->second[0]; + VectorType *VType = cast(EE->getOperand(0)->getType()); + unsigned NumElements = VType->getNumElements(); + Value *NewExt = nullptr; + unsigned Len = (&I)->second.size(); + + for (unsigned CI = 0; CI < Len; CI+=NumElements) { + unsigned Size = std::min(Len - CI, NumElements); + Changed |= combineExtractElements(makeArrayRef(&(&I)->second[CI], + Size), R, NewExt); + } + } + return Changed; +} + + +unsigned SLPVectorizer::collectInsertElements(BasicBlock *BB) { + unsigned Count = 0; + DenseMap Visited; + InsertElems.clear(); + for (auto &IT : *BB) { + InsertElementInst *IE = dyn_cast(&IT); + if (!IE || Visited[IE]) + continue; + + // Find the vector used by the insertelement instruction + // and the instruction that defines it. + // Check if it forms the head of a chain of insertelements + // and collect those insertelements. + // TODO: the chain built can be very long, there can be + // multiple chains embedded in. + Value *VOp = IE->getOperand(0); + Value *VHead = cast(IE); + if (!VHead->hasOneUse()) + continue; + + VectorType *Type = dyn_cast(VOp->getType()); + + if (!Type) + continue; + + DEBUG(dbgs() << "SLP: Found insertelement head of chain.\n"); + Value *V = VHead; + InsertElems[VHead].push_back(IE); + Visited[IE] = true; + Count++; + + while (V->hasOneUse()) { + User *U = V->user_back(); + InsertElementInst *UI = dyn_cast(U); + if (!UI || U->getOperand(0) != V) + break; + if (UI->getParent() != BB) + break; + // Save the insertelements locations. + InsertElems[VHead].push_back(UI); + Visited[UI] = true; + Count++; + V = UI; + } + } + return Count; +} + +unsigned SLPVectorizer::collectExtractElements(BasicBlock *BB) { + unsigned Count = 0; + DenseMap Visited; + ExtractElems.clear(); + for (auto &IT : *BB) { + ExtractElementInst *EE = dyn_cast(&IT); + if (!EE || Visited[EE]) + continue; + + // Find the vector used by the extractelement instruction + // and if the only use is to extend the extracted value. + Value *VHead = EE->getOperand(0); + if (isa(VHead)) + continue; + + Value *V = cast(EE); + if (!V->hasOneUse()) + continue; + + // Make sure the head of the chain appear first, + // because the order of user is random. + // We need this to insert new ext in right place. + ExtractElems[VHead].push_back(EE); + Visited[EE] = true; + Count++; + + for (User *U : VHead->users()) { + ExtractElementInst *UEE = dyn_cast(U); + if (!UEE || Visited[UEE]) + continue; + // TODO + // For users that are extract element that are only used + // once and by an extend operation, then add them to the + // list of VHead. + if (UEE->getParent() != BB) + continue; + V = cast(UEE); + if (!V->hasOneUse()) + continue; + User *UU = V->user_back(); + Instruction *UI = dyn_cast(UU); + if (!UI || (!isa(UI) && !isa(UI))) + continue; + // Save the extractelements locations. + ExtractElems[VHead].push_back(UEE); + Visited[UEE] = true; + Count++; + } + } + return Count; +} + } // end anonymous namespace char SLPVectorizer::ID = 0; Index: test/Transforms/SLPVectorizer/AArch64/combine-extractelement.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/combine-extractelement.ll @@ -0,0 +1,137 @@ +; RUN: opt -S -slp-vectorizer -slp-vectorize-scatter -adce %s | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; TO REMOVE THIS COMMENT: this is the first case we want to address: extract +; all elements of a same vector (in any order). +define void @test1(<8 x i8> %v1, i16* %arrayidx1, i16* %arrayidx2, i16* %arrayidx3, i16* %arrayidx4, +i16* %arrayidx5, i16* %arrayidx6, i16* %arrayidx7, i16* %arrayidx8) { +; CHECK-LABEL: @test1 +; CHECK: %1 = zext <8 x i8> %v1 to <8 x i16> +; CHECK-NEXT: %2 = extractelement <8 x i16> %1, i32 0 +; CHECK-NEXT: %3 = extractelement <8 x i16> %1, i32 1 +; CHECK-NEXT: %4 = extractelement <8 x i16> %1, i32 2 +; CHECK-NEXT: %5 = extractelement <8 x i16> %1, i32 3 +; CHECK-NEXT: %6 = extractelement <8 x i16> %1, i32 4 +; CHECK-NEXT: %7 = extractelement <8 x i16> %1, i32 5 +; CHECK-NEXT: %8 = extractelement <8 x i16> %1, i32 6 +; CHECK-NEXT: %9 = extractelement <8 x i16> %1, i32 7 + + %1 = extractelement <8 x i8> %v1, i32 0 + %conv1 = zext i8 %1 to i16 + %2 = extractelement <8 x i8> %v1, i32 1 + %conv2 = zext i8 %2 to i16 + %3 = extractelement <8 x i8> %v1, i32 2 + %conv3 = zext i8 %3 to i16 + %4 = extractelement <8 x i8> %v1, i32 3 + %conv4 = zext i8 %4 to i16 + %5 = extractelement <8 x i8> %v1, i32 4 + %conv5 = zext i8 %5 to i16 + %6 = extractelement <8 x i8> %v1, i32 5 + %conv6 = zext i8 %6 to i16 + %7 = extractelement <8 x i8> %v1, i32 6 + %conv7 = zext i8 %7 to i16 + %8 = extractelement <8 x i8> %v1, i32 7 + %conv8 = zext i8 %8 to i16 + store i16 %conv1, i16* %arrayidx1 + store i16 %conv2, i16* %arrayidx2 + store i16 %conv3, i16* %arrayidx3 + store i16 %conv4, i16* %arrayidx4 + store i16 %conv5, i16* %arrayidx5 + store i16 %conv6, i16* %arrayidx6 + store i16 %conv7, i16* %arrayidx7 + store i16 %conv8, i16* %arrayidx8 + ret void +} + +; TO REMOVE THIS COMMENT: this is the first case we want to address: +; extract all elements of a same vector (in any order). +define void @test2(<8 x i8> %v1, i16* %arrayidx1, i16* %arrayidx2, + i16* %arrayidx3, i16* %arrayidx4, i16* %arrayidx5, i16* %arrayidx6, + i16* %arrayidx7, i16* %arrayidx8, i16* %arrayidx9, i16* %arrayidx10, +i16* %arrayidx11, i16* %arrayidx12, i16* %arrayidx13, i16* %arrayidx14, +i16* %arrayidx15, i16* %arrayidx16) { +; CHECK-LABEL: @test2 +; CHECK: %1 = zext <8 x i8> %v1 to <8 x i16> +; CHECK-NEXT: %2 = extractelement <8 x i16> %1, i32 0 +; CHECK-NEXT: %3 = extractelement <8 x i16> %1, i32 1 +; CHECK-NEXT: %4 = extractelement <8 x i16> %1, i32 2 +; CHECK-NEXT: %5 = extractelement <8 x i16> %1, i32 3 +; CHECK-NEXT: %6 = extractelement <8 x i16> %1, i32 4 +; CHECK-NEXT: %7 = extractelement <8 x i16> %1, i32 5 +; CHECK-NEXT: %8 = extractelement <8 x i16> %1, i32 6 +; CHECK-NEXT: %9 = extractelement <8 x i16> %1, i32 7 +; CHECK-NEXT: %10 = extractelement <8 x i16> %1, i32 0 +; CHECK-NEXT: %11 = extractelement <8 x i16> %1, i32 1 +; CHECK-NEXT: %12 = extractelement <8 x i16> %1, i32 2 +; CHECK-NEXT: %13 = extractelement <8 x i16> %1, i32 3 +; CHECK-NEXT: %14 = extractelement <8 x i16> %1, i32 4 +; CHECK-NEXT: %15 = extractelement <8 x i16> %1, i32 5 +; CHECK-NEXT: %16 = extractelement <8 x i16> %1, i32 6 +; CHECK-NEXT: %17 = extractelement <8 x i16> %1, i32 7 + + %1 = extractelement <8 x i8> %v1, i32 0 + %conv1 = zext i8 %1 to i16 + %2 = extractelement <8 x i8> %v1, i32 1 + %conv2 = zext i8 %2 to i16 + %3 = extractelement <8 x i8> %v1, i32 2 + %conv3 = zext i8 %3 to i16 + %4 = extractelement <8 x i8> %v1, i32 3 + %conv4 = zext i8 %4 to i16 + %5 = extractelement <8 x i8> %v1, i32 4 + %conv5 = zext i8 %5 to i16 + %6 = extractelement <8 x i8> %v1, i32 5 + %conv6 = zext i8 %6 to i16 + %7 = extractelement <8 x i8> %v1, i32 6 + %conv7 = zext i8 %7 to i16 + %8 = extractelement <8 x i8> %v1, i32 7 + %conv8 = zext i8 %8 to i16 + %9 = extractelement <8 x i8> %v1, i32 0 + %conv9 = zext i8 %9 to i16 + %10 = extractelement <8 x i8> %v1, i32 1 + %conv10 = zext i8 %10 to i16 + %11 = extractelement <8 x i8> %v1, i32 2 + %conv11 = zext i8 %11 to i16 + %12 = extractelement <8 x i8> %v1, i32 3 + %conv12 = zext i8 %12 to i16 + %13 = extractelement <8 x i8> %v1, i32 4 + %conv13 = zext i8 %13 to i16 + %14 = extractelement <8 x i8> %v1, i32 5 + %conv14 = zext i8 %14 to i16 + %15 = extractelement <8 x i8> %v1, i32 6 + %conv15 = zext i8 %15 to i16 + %16 = extractelement <8 x i8> %v1, i32 7 + %conv16 = zext i8 %16 to i16 + store i16 %conv1, i16* %arrayidx1 + store i16 %conv2, i16* %arrayidx2 + store i16 %conv3, i16* %arrayidx3 + store i16 %conv4, i16* %arrayidx4 + store i16 %conv5, i16* %arrayidx5 + store i16 %conv6, i16* %arrayidx6 + store i16 %conv7, i16* %arrayidx7 + store i16 %conv8, i16* %arrayidx8 + store i16 %conv9, i16* %arrayidx9 + store i16 %conv10, i16* %arrayidx10 + store i16 %conv11, i16* %arrayidx11 + store i16 %conv12, i16* %arrayidx12 + store i16 %conv13, i16* %arrayidx13 + store i16 %conv14, i16* %arrayidx14 + store i16 %conv15, i16* %arrayidx15 + store i16 %conv16, i16* %arrayidx16 + ret void +} + + +; TO REMOVE THIS COMMENT: this is the second case we want to address: extract +; some elements of a same vector (in any order). But not sure it is profitable. +define void @test3(<8 x i8> %v1, i16* %arrayidx1, i16* %arrayidx2) { +; CHECK-LABEL: @test3 + %x = extractelement <8 x i8> %v1, i32 0 + %conv1 = zext i8 %x to i16 + %y = extractelement <8 x i8> %v1, i32 1 + %conv2 = zext i8 %y to i16 + store i16 %conv1, i16* %arrayidx1 + store i16 %conv2, i16* %arrayidx2 + ret void +} + Index: test/Transforms/SLPVectorizer/AArch64/combine-insertelement.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/combine-insertelement.ll @@ -0,0 +1,187 @@ +; RUN: opt -S -slp-vectorizer -slp-vectorize-gather -adce %s | FileCheck %s +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-gnu" + +; TO REMOVE THIS COMMENT: this is the first case we want to address: build vector +; from undef vector if all indexes are used. +define <8 x i16> @test1(i8* %arrayidx1, i8* %arrayidx2, i8* %arrayidx3, +; CHECK-LABEL: @test1 + i8* %arrayidx4, i8* %arrayidx5, i8* %arrayidx6, i8* %arrayidx7, i8* %arrayidx8) { +; CHECK: %9 = insertelement <8 x i8> undef, i8 %1, i32 0 +; CHECK-NEXT: %10 = insertelement <8 x i8> %9, i8 %2, i32 1 +; CHECK-NEXT: %11 = insertelement <8 x i8> %10, i8 %3, i32 2 +; CHECK-NEXT: %12 = insertelement <8 x i8> %11, i8 %4, i32 3 +; CHECK-NEXT: %13 = insertelement <8 x i8> %12, i8 %5, i32 4 +; CHECK-NEXT: %14 = insertelement <8 x i8> %13, i8 %6, i32 5 +; CHECK-NEXT: %15 = insertelement <8 x i8> %14, i8 %7, i32 6 +; CHECK-NEXT: %16 = insertelement <8 x i8> %15, i8 %8, i32 7 +; CHECK-NEXT: %17 = zext <8 x i8> %16 to <8 x i16> + + %1 = load i8, i8* %arrayidx1 + %conv1 = zext i8 %1 to i16 + %2 = load i8, i8* %arrayidx2 + %conv2 = zext i8 %2 to i16 + %3 = load i8, i8* %arrayidx3 + %conv3 = zext i8 %3 to i16 + %4 = load i8, i8* %arrayidx4 + %conv4 = zext i8 %4 to i16 + %5 = load i8, i8* %arrayidx5 + %conv5 = zext i8 %5 to i16 + %6 = load i8, i8* %arrayidx6 + %conv6 = zext i8 %6 to i16 + %7 = load i8, i8* %arrayidx7 + %conv7 = zext i8 %7 to i16 + %8 = load i8, i8* %arrayidx8 + %conv8 = zext i8 %8 to i16 + %x0 = insertelement <8 x i16> undef, i16 %conv1, i32 0 + %x1 = insertelement <8 x i16> %x0, i16 %conv2, i32 1 + %x2 = insertelement <8 x i16> %x1, i16 %conv3, i32 2 + %x3 = insertelement <8 x i16> %x2, i16 %conv4, i32 3 + %x4 = insertelement <8 x i16> %x3, i16 %conv5, i32 4 + %x5 = insertelement <8 x i16> %x4, i16 %conv6, i32 5 + %x6 = insertelement <8 x i16> %x5, i16 %conv7, i32 6 + %x7 = insertelement <8 x i16> %x6, i16 %conv8, i32 7 + ret <8 x i16> %x7 +} + +; TO REMOVE THIS COMMENT: this is the second case we want to address: +; build vector from a previously defined vector only if all indexes are used. +define <8 x i16> @test2(i8* %arrayidx1, i8* %arrayidx2, i8* %arrayidx3, +; CHECK-LABEL: @test2 + i8* %arrayidx4, i8* %arrayidx5, i8* %arrayidx6, i8* %arrayidx7, i8* %arrayidx8, <8 x i16> %x) { +; CHECK: %9 = insertelement <8 x i8> undef, i8 %1, i32 0 +; CHECK-NEXT: %10 = insertelement <8 x i8> %9, i8 %2, i32 1 +; CHECK-NEXT: %11 = insertelement <8 x i8> %10, i8 %3, i32 2 +; CHECK-NEXT: %12 = insertelement <8 x i8> %11, i8 %4, i32 3 +; CHECK-NEXT: %13 = insertelement <8 x i8> %12, i8 %5, i32 4 +; CHECK-NEXT: %14 = insertelement <8 x i8> %13, i8 %6, i32 5 +; CHECK-NEXT: %15 = insertelement <8 x i8> %14, i8 %7, i32 6 +; CHECK-NEXT: %16 = insertelement <8 x i8> %15, i8 %8, i32 7 +; CHECK-NEXT: %17 = zext <8 x i8> %16 to <8 x i16> + + %1 = load i8, i8* %arrayidx1 + %conv1 = zext i8 %1 to i16 + %2 = load i8, i8* %arrayidx2 + %conv2 = zext i8 %2 to i16 + %3 = load i8, i8* %arrayidx3 + %conv3 = zext i8 %3 to i16 + %4 = load i8, i8* %arrayidx4 + %conv4 = zext i8 %4 to i16 + %5 = load i8, i8* %arrayidx5 + %conv5 = zext i8 %5 to i16 + %6 = load i8, i8* %arrayidx6 + %conv6 = zext i8 %6 to i16 + %7 = load i8, i8* %arrayidx7 + %conv7 = zext i8 %7 to i16 + %8 = load i8, i8* %arrayidx8 + %conv8 = zext i8 %8 to i16 + %x0 = insertelement <8 x i16> %x, i16 %conv1, i32 0 + %x1 = insertelement <8 x i16> %x0, i16 %conv2, i32 1 + %x2 = insertelement <8 x i16> %x1, i16 %conv3, i32 2 + %x3 = insertelement <8 x i16> %x2, i16 %conv4, i32 3 + %x4 = insertelement <8 x i16> %x3, i16 %conv5, i32 4 + %x5 = insertelement <8 x i16> %x4, i16 %conv6, i32 5 + %x6 = insertelement <8 x i16> %x5, i16 %conv7, i32 6 + %x7 = insertelement <8 x i16> %x6, i16 %conv8, i32 7 + ret <8 x i16> %x7 +} + + +; TO REMOVE THIS COMMENT: make sure we can capture several chains. +define <8 x i16> @test3(i8* %arrayidx1, i8* %arrayidx2, i8* %arrayidx3, +; CHECK-LABEL: @test3 + i8* %arrayidx4, i8* %arrayidx5, i8* %arrayidx6, i8* %arrayidx7, i8* %arrayidx8) { +; CHECK: %9 = insertelement <8 x i8> undef, i8 %1, i32 0 +; CHECK-NEXT: %10 = insertelement <8 x i8> %9, i8 %2, i32 1 +; CHECK-NEXT: %11 = insertelement <8 x i8> %10, i8 %3, i32 2 +; CHECK-NEXT: %12 = insertelement <8 x i8> %11, i8 %4, i32 3 +; CHECK-NEXT: %13 = insertelement <8 x i8> %12, i8 %5, i32 4 +; CHECK-NEXT: %14 = insertelement <8 x i8> %13, i8 %6, i32 5 +; CHECK-NEXT: %15 = insertelement <8 x i8> %14, i8 %7, i32 6 +; CHECK-NEXT: %16 = insertelement <8 x i8> %15, i8 %8, i32 7 +; CHECK-NEXT: %17 = zext <8 x i8> %16 to <8 x i16> + + %1 = load i8, i8* %arrayidx1 + %conv1 = zext i8 %1 to i16 + %2 = load i8, i8* %arrayidx2 + %conv2 = zext i8 %2 to i16 + %3 = load i8, i8* %arrayidx3 + %conv3 = zext i8 %3 to i16 + %4 = load i8, i8* %arrayidx4 + %conv4 = zext i8 %4 to i16 + %5 = load i8, i8* %arrayidx5 + %conv5 = zext i8 %5 to i16 + %6 = load i8, i8* %arrayidx6 + %conv6 = zext i8 %6 to i16 + %7 = load i8, i8* %arrayidx7 + %conv7 = zext i8 %7 to i16 + %8 = load i8, i8* %arrayidx8 + %conv8 = zext i8 %8 to i16 + %x0 = insertelement <8 x i16> undef, i16 %conv1, i32 0 + %x1 = insertelement <8 x i16> %x0, i16 %conv2, i32 1 + %x2 = insertelement <8 x i16> %x1, i16 %conv3, i32 2 + %x3 = insertelement <8 x i16> %x2, i16 %conv4, i32 3 + %x4 = insertelement <8 x i16> %x3, i16 %conv5, i32 4 + %x5 = insertelement <8 x i16> %x4, i16 %conv6, i32 5 + %x6 = insertelement <8 x i16> %x5, i16 %conv7, i32 6 + %x7 = insertelement <8 x i16> %x6, i16 %conv8, i32 7 + +; CHECK: %18 = insertelement <8 x i8> undef, i8 %1, i32 0 +; CHECK-NEXT: %19 = insertelement <8 x i8> %18, i8 %2, i32 1 +; CHECK-NEXT: %20 = insertelement <8 x i8> %19, i8 %3, i32 2 +; CHECK-NEXT: %21 = insertelement <8 x i8> %20, i8 %4, i32 3 +; CHECK-NEXT: %22 = insertelement <8 x i8> %21, i8 %5, i32 4 +; CHECK-NEXT: %23 = insertelement <8 x i8> %22, i8 %6, i32 5 +; CHECK-NEXT: %24 = insertelement <8 x i8> %23, i8 %7, i32 6 +; CHECK-NEXT: %25 = insertelement <8 x i8> %24, i8 %8, i32 7 +; CHECK-NEXT: %26 = zext <8 x i8> %25 to <8 x i16> + + %conv1y = zext i8 %1 to i16 + %conv2y = zext i8 %2 to i16 + %conv3y = zext i8 %3 to i16 + %conv4y = zext i8 %4 to i16 + %conv5y = zext i8 %5 to i16 + %conv6y = zext i8 %6 to i16 + %conv7y = zext i8 %7 to i16 + %conv8y = zext i8 %8 to i16 + + %y0 = insertelement <8 x i16> undef, i16 %conv1y, i32 0 + %y1 = insertelement <8 x i16> %y0, i16 %conv2y, i32 1 + %y2 = insertelement <8 x i16> %y1, i16 %conv3y, i32 2 + %y3 = insertelement <8 x i16> %y2, i16 %conv4y, i32 3 + %y4 = insertelement <8 x i16> %y3, i16 %conv5y, i32 4 + %y5 = insertelement <8 x i16> %y4, i16 %conv6y, i32 5 + %y6 = insertelement <8 x i16> %y5, i16 %conv7y, i32 6 + %y7 = insertelement <8 x i16> %y6, i16 %conv8y, i32 7 + + %z = add <8 x i16> %x7, %y7 + ret <8 x i16> %z +} + + +; TO REMOVE THIS COMMENT: this is the third case we want to address: +; build vector from undef vector no matter how many indexes are accessed. +define <8 x i16> @test4(i8* %arrayidx1, i8* %arrayidx2) { +; CHECK-LABEL: @test4 + + %1 = load i8, i8* %arrayidx1 + %conv1 = zext i8 %1 to i16 + %x = insertelement <8 x i16> undef, i16 %conv1, i32 5 + ret <8 x i16> %x +} + +; TO REMOVE THIS COMMENT: similar to above just more entries, +; this is the fourth case we want to address: +; build vector from undef vector no matter how many indexes are accessed. +define <8 x i16> @test5(i8* %arrayidx1, i8* %arrayidx2) { +; CHECK-LABEL: @test5 + + %1 = load i8, i8* %arrayidx1 + %conv1 = zext i8 %1 to i16 + %2 = load i8, i8* %arrayidx2 + %conv2 = zext i8 %2 to i16 + %x = insertelement <8 x i16> undef, i16 %conv1, i32 5 + %y = insertelement <8 x i16> %x, i16 %conv2, i32 1 + ret <8 x i16> %y +} +