Index: include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- include/llvm/Analysis/LoopAccessAnalysis.h
+++ include/llvm/Analysis/LoopAccessAnalysis.h
@@ -237,6 +237,10 @@
   SmallVector<Instruction *, 4>
   getInstructionsForAccess(Value *Ptr, bool isWrite) const;
 
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+
 private:
   ScalarEvolution *SE;
   const Loop *InnermostLoop;
@@ -286,10 +290,220 @@
   Dependence::DepType isDependent(const MemAccessInfo &A, unsigned AIdx,
                                   const MemAccessInfo &B, unsigned BIdx,
                                   const ValueToValueMap &Strides);
+};
 
-  /// \brief Check whether the data dependence could prevent store-load
-  /// forwarding.
-  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+/// \brief The group of interleaved loads/stores sharing the same stride and
+/// close to each other.
+///
+/// Each member in this group has an index starting from 0, and the largest
+/// index is always less than Delta (the interleave factor), which is the
+/// absolute value of the access stride.
+///
+/// E.g. Interleaved load group of Delta 4 (with a gap: index 2):
+///   for (unsigned i = 0; i < 1024; i+=4) {
+///     a = A[i];     // Member of index 0
+///     b = A[i+1];   // Member of index 1
+///     d = A[i+3];   // Member of index 3
+///     ...
+///   }
+///
+/// Interleaved store group of Delta 4:
+///   for (unsigned i = 0; i < 1024; i+=4) {
+///     ...
+///     A[i]   = a;   // Member of index 0
+///     A[i+1] = b;   // Member of index 1
+///     A[i+2] = c;   // Member of index 2
+///     A[i+3] = d;   // Member of index 3
+///   }
+///
+/// Note: an interleaved load group may have gaps (missing members), but an
+/// interleaved store group doesn't allow gaps.
+class InterleaveGroup {
+public:
+  InterleaveGroup(Instruction *Instr, int Stride, unsigned Align)
+      : Align(Align), SmallestKey(0), LargestKey(0), InsertPos(nullptr) {
+    assert(Align && "The alignment should be non-zero");
+
+    Delta = std::abs(Stride);
+    assert(Delta > 1 && "Invalid interleave factor");
+
+    Reverse = Stride < 0;
+    Members[0] = Instr;
+  }
+
+  bool isReverse() const { return Reverse; }
+  unsigned getDelta() const { return Delta; }
+  unsigned getAlign() const { return Align; }
+  unsigned getNumMembers() const { return Members.size(); }
+
+  /// \brief Try to insert a new member \p Instr with index \p Index and
+  /// alignment \p NewAlign.
+  ///
+  /// \returns false if the instruction doesn't belong to the group.
+  bool insertMember(Instruction *Instr, int Index, unsigned NewAlign) {
+    assert(NewAlign && "The new member's alignment should be non-zero");
+
+    int Key = Index + SmallestKey;
+
+    // Skip if there is already a member with the same index.
+    if (Members.count(Key))
+      return false;
+
+    if (Key > LargestKey) {
+      // The largest index is always less than the Delta.
+      if (Index >= Delta)
+        return false;
+
+      LargestKey = Key;
+    } else if (Key < SmallestKey) {
+      // The index range of the group (LargestKey - SmallestKey) must also
+      // stay less than the Delta.
+      if (LargestKey - Key >= Delta)
+        return false;
+
+      SmallestKey = Key;
+    }
+
+    // It's always safe to select the minimum alignment.
+    Align = std::min(Align, NewAlign);
+    Members[Key] = Instr;
+    return true;
+  }
+
+  /// \brief Get the member with the given index \p Index.
+  ///
+  /// \returns nullptr if the group doesn't contain such a member.
+  Instruction *getMember(unsigned Index) const {
+    int Key = SmallestKey + Index;
+    if (!Members.count(Key))
+      return nullptr;
+
+    return Members.find(Key)->second;
+  }
+
+  /// \brief Get the index for the given member. Unlike the key in the member
+  /// map, the index starts from 0.
+  unsigned getIndex(Instruction *Instr) const {
+    for (auto I : Members)
+      if (I.second == Instr)
+        return I.first - SmallestKey;
+
+    llvm_unreachable("InterleaveGroup contains no such member");
+  }
+
+  Instruction *getInsertPos() const { return InsertPos; }
+  void setInsertPos(Instruction *Inst) { InsertPos = Inst; }
+
+  /// \brief Print the group.
+  void print(raw_ostream &OS, unsigned Depth) const {
+    OS.indent(Depth) << "Interleave Group of Delta " << Delta << ":\n";
+
+    for (int i = 0; i < Delta; i++) {
+      Instruction *Member = getMember(i);
+      if (Member)
+        OS.indent(Depth + 2) << "Index " << i << ":" << *Member << "\n";
+    }
+  }
+
+private:
+  int Delta; // Interleave Factor.
+  bool Reverse;
+  unsigned Align;
+  DenseMap<int, Instruction *> Members;
+  int SmallestKey;
+  int LargestKey;
+
+  // To avoid breaking dependences, an interleaved access should be inserted
+  // at either the first load or the last store in program order.
+  // E.g. %even = load i32               // Insert Position
+  //      %add = add i32 %even           // Use of %even
+  //      %odd = load i32
+  //
+  //      store i32 %even
+  //      %odd = add i32                 // Def of %odd
+  //      store i32 %odd                 // Insert Position
+  Instruction *InsertPos;
+};
+
+/// \brief Drive the analysis of interleaved memory accesses in the loop.
+///
+/// Call this class to analyze interleaved accesses only when the memory
+/// dependence check says we can vectorize the loop. Otherwise there is no
+/// point in the analysis, as vectorizing interleaved accesses would be unsafe.
+///
+/// The analysis collects the interleave groups and records the
+/// member-to-group relationships in a map.
+class InterleavedAccessInfo {
+public:
+  InterleavedAccessInfo(ScalarEvolution *SE, Loop *L, DominatorTree *DT,
+                        MemoryDepChecker *DepChecker)
+      : SE(SE), TheLoop(L), DT(DT), DepChecker(DepChecker) {}
+
+  ~InterleavedAccessInfo() {
+    SmallSet DelSet;
+    // Collect all the group pointers first so that each group is released
+    // only once.
+    for (auto &I : InterleaveGroupMap)
+      DelSet.insert(I.second);
+    for (auto *Ptr : DelSet)
+      delete Ptr;
+  }
+
+  /// \brief Analyze the interleaved accesses. Substitute symbolic strides
+  /// using \p Strides.
+  void analyzeInterleaving(const ValueToValueMap &Strides);
+
+  /// \brief Check if \p Instr belongs to any interleave group.
+  bool isAccessInterleaved(Instruction *Instr) const {
+    return InterleaveGroupMap.count(Instr);
+  }
+
+  /// \brief Get the interleave group that \p Instr belongs to.
+  ///
+  /// \returns nullptr if \p Instr doesn't belong to any group.
+  InterleaveGroup *getInterleaveGroup(Instruction *Instr) const {
+    if (InterleaveGroupMap.count(Instr))
+      return InterleaveGroupMap.find(Instr)->second;
+    return nullptr;
+  }
+
+  /// \brief Print the interleave groups.
+  void print(raw_ostream &OS, unsigned Depth) const;
+
+private:
+  ScalarEvolution *SE;
+  Loop *TheLoop;
+  DominatorTree *DT;
+  MemoryDepChecker *DepChecker;
+
+  /// Contains the relationships between the members and the interleave group.
+  DenseMap<Instruction *, InterleaveGroup *> InterleaveGroupMap;
+
+  /// \brief Group the interleaved loads/stores. Record the write-write pairs
+  /// whose dependences may be broken by vectorization.
+ void groupInterleavedAccesses( + ArrayRef InstrList, + SmallVector, 2> &WritePairs, + const ValueToValueMap &Strides); + + /// \brief Create a new interleave group with the given instruction \p Instr, + /// stride \p Stride and alignment \p Align. + /// + /// \returns the newly created interleave group. + InterleaveGroup *createInterleaveGroup(Instruction *Instr, int Stride, + unsigned Align) { + assert(!InterleaveGroupMap.count(Instr) && + "Already in an interleaved access group"); + InterleaveGroupMap[Instr] = new InterleaveGroup(Instr, Stride, Align); + return InterleaveGroupMap[Instr]; + } + + /// \brief Release the group and remove all the relationships. + void releaseGroup(InterleaveGroup *Group) { + for (unsigned i = 0; i < Group->getDelta(); i++) + if (Instruction *Member = Group->getMember(i)) + InterleaveGroupMap.erase(Member); + + delete Group; + } }; /// \brief Drive the analysis of memory accesses in the loop @@ -306,6 +520,9 @@ /// generates run-time checks to prove independence. This is done by /// AccessAnalysis::canCheckPtrAtRT and the checks are maintained by the /// RuntimePointerCheck class. +/// +/// If the memory accesses can be vectorized, it will analyze the interleaved +/// access information, which is delegated to the InterleavedAccessInfo class. class LoopAccessInfo { public: /// This struct holds information about the memory runtime legality check that @@ -427,6 +644,16 @@ return DepChecker.getInstructionsForAccess(Ptr, isWrite); } + /// \brief Check if \p Instr belongs to any interleave group. + bool isAccessInterleaved(Instruction *Instr) const { + return InterleaveInfo.isAccessInterleaved(Instr); + } + + /// \brief Get the interleave group that \p Instr belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) const { + return InterleaveInfo.getInterleaveGroup(Instr); + } + /// \brief Print the information about the memory accesses in the loop. void print(raw_ostream &OS, unsigned Depth = 0) const; @@ -460,6 +687,10 @@ /// loop-independent and loop-carried dependences between memory accesses. MemoryDepChecker DepChecker; + /// \brief The interleaved access information contains groups of interleaved + /// accesses with the same stride and close to each other. + InterleavedAccessInfo InterleaveInfo; + /// \brief Number of memchecks required to prove independence of otherwise /// may-alias pointers unsigned NumComparisons; Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,18 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of the interleaved memory operation. + /// \p Opcode is the memory operation code + /// \p VecTy is the vector type of the interleaved access. + /// \p Delta is the interleave factor + /// \p Indices is the indices for interleaved load members (as interleaved + /// loads allow gaps) + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction. 
/// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +594,11 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +757,14 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Indices, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,14 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,75 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Delta, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + assert(VT && "Expect vector types"); + + unsigned NumElts = VT->getNumElements(); + assert(Delta >= 2 && !(NumElts % Delta) && "Invalid Delta"); + + unsigned NumSubElts = NumElts / Delta; + VectorType *SubVT = VectorType::get(VT->getElementType(), NumSubElts); + + // Firstly, the cost of load/store operation. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // Then plus the cost of interleave operation. + if (Opcode == Instruction::Load) { + // The interleave cost is similar to extract elements of the sub vectors + // from the wide vector, and insert them into sub vectors. + // + // E.g. Interleaved load of Delta 3 to 2 sub vectors (%v0, %v2): + // %vec = load <12 x i32>, <12 x i32> %ptr + // %v0 = shuffle %vec, undef, <0, 3, 6, 9> ; Index 0 + // %v2 = shuffle %vec, undef, <2, 5, 8, 11> ; Index 2 + // The cost is estimated as extract elements at 0, 2, 3, 5, 6, 8, 9, 11 + // from the <12 x i32> vector and insert them into two <4 x i32> vectors. 
+ + assert(Indices.size() <= Delta && + "Interleaved memory op has too many members"); + for (unsigned Index : Indices) { + assert(Index < Delta && "Invalid index for interleaved memory op"); + for (unsigned i = 0; i < NumSubElts; i++) + Cost += + getVectorInstrCost(Instruction::ExtractElement, VT, Index + i); + } + + unsigned InsSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + InsSubCost += getVectorInstrCost(Instruction::InsertElement, SubVT, i); + + Cost += Indices.size() * InsSubCost; + } else { + // The interleave cost is extract each element from sub vectors, and + // insert them into the wide vector. + // + // E.g. Interleaved store with Delta 3 (For vector %v0, %v1, %v2): + // %v0_v1 = shuffle %v0, %v1, <0, 1, 2, 3, 4, 5, 6, 7> + // %v2_u = shuffle %v2, undef, <0, 1, 2, 3, u, u, u, u> + // %interleaved.vec = shuffle %v0_v1, %v2_u, + // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> + // store <12 x i32> %interleaved.vec, <12 x i32>* %ptr + // The cost is estimated as extract all elements from the 3 <4 x i32> + // vectors (Total 3 * 4 elements) and insert into the <12 x i32> vector. + + unsigned ExtSubCost = 0; + for (unsigned i = 0; i < NumSubElts; i++) + ExtSubCost += getVectorInstrCost(Instruction::ExtractElement, SubVT, i); + + Cost += Delta * ExtSubCost; + + for (unsigned i = 0; i < NumElts; i++) + Cost += getVectorInstrCost(Instruction::InsertElement, VT, i); + } + + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -41,6 +41,10 @@ VectorizerParams::VectorizationInterleave)); unsigned VectorizerParams::VectorizationInterleave; +static cl::opt EnableInterleaving( + "enable-interleaving", cl::init(false), cl::Hidden, + cl::desc("Enable analyzing interleaved accesses in a loop")); + static cl::opt RuntimeMemoryCheckThreshold( "runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " @@ -783,21 +787,32 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); + // The number of iterations to be vectorized and unrolled. + unsigned NumIter = std::max(ForcedFactor * ForcedUnroll, 2U); + + unsigned Stride = std::abs(StrideAPtr); + // Safe when either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride)) { + DEBUG(dbgs() << "LAA: Failure because of positive distance " + << Val.getSExtValue() << '\n'); + return Dependence::Backward; + } - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { - DEBUG(dbgs() << "LAA: Failure because of Positive distance " - << Val.getSExtValue() << '\n'); + // Safe when positive distance is not greater than the max safe distance. 
+ if (Distance > MaxSafeDepDistBytes) { + DEBUG(dbgs() << "LAA: Failure because positive distance " + << Val.getSExtValue() << " is greater than max safe distance " + << MaxSafeDepDistBytes << "\n"); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && @@ -900,6 +915,241 @@ OS.indent(Depth + 2) << *Instrs[Destination] << "\n"; } +struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} + + int Stride; + const SCEV *Scev; + unsigned Size; + unsigned Align; +}; + +void InterleavedAccessInfo::groupInterleavedAccesses( + ArrayRef InstrList, + SmallVector, 2> &WritePairs, + const ValueToValueMap &Strides) { + // Holds all the stride accesses. + SmallVector, 16> StrideAccess; + + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + for (auto I : InstrList) { + LoadInst *LI = dyn_cast(I); + StoreInst *SI = dyn_cast(I); + assert((LI || SI) && "Invalid access instruction"); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + + // Only analyze non-unit stride accesses. + if (std::abs(Stride) < 2) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StrideAccess.push_back( + std::make_pair(I, StrideDescriptor(Stride, Scev, Size, Align))); + } + + if (!StrideAccess.size()) + return; + + // Search the load-load/write-write pair B-A in bottom-up order and try to + // insert B into the interleave group of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. The distance of B to the leader of the group is less than the Delta. + // + // The bottom-up order can avoid breaking the WAW dependences between two + // pointers with the same base. + // E.g. A[i] = a; (1) + // A[i] = b; (2) + // A[i+1] = c (3) + // We form the group (2)+(3), so that (1) has to form groups with access + // above (1), which guarantees that (1) is always above (2). + for (auto I = StrideAccess.rbegin(), E = StrideAccess.rend(); I != E; ++I) { + Instruction *A = I->first; + StrideDescriptor DesA = I->second; + + InterleaveGroup *Group = getInterleaveGroup(A); + if (!Group) { + DEBUG(dbgs() << "LAA: Creating an interleave group with:" << *A << '\n'); + Group = createInterleaveGroup(A, DesA.Stride, DesA.Align); + } + + for (auto II = std::next(I); II != E; ++II) { + Instruction *B = II->first; + StrideDescriptor DesB = II->second; + + if (A->mayReadFromMemory() != B->mayReadFromMemory()) + continue; + + // Check the rule 1 and 2. + if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) + continue; + + // Calculate the distance and prepare for the rule 3. 
+ const SCEVConstant *DistToA = + dyn_cast(SE->getMinusSCEV(DesB.Scev, DesA.Scev)); + if (!DistToA) + continue; + + int DistanceToA = DistToA->getValue()->getValue().getSExtValue(); + // Read/write the same location. + if (!DistanceToA) + continue; + + int Size = static_cast(DesA.Size); + if (DistanceToA % Size) { + // Record the write-write pairs that could be broken after vectorizaton. + // + // E.g. char *Tmp = (char *)A; // A is int* + // int *B = (int *) (++Tmp); + // for(i = 0; i < n; i+=2) { + // A[i] = a; // (1) + // B[i] = b; // (2) + // A[i+1] = c; // (3) + // } + // The combine of (1) and (3) will be inserted below (2). + if (A->mayWriteToMemory()) + // Reuse the store-load forwarding check for whether there is overlap. + if (DepChecker->couldPreventStoreLoadForward(std::abs(DistanceToA), + Size)) + WritePairs.push_back(std::make_pair(B, A)); + continue; + } + + // Skip if B is already in a Group. + if (getInterleaveGroup(B)) + continue; + + // The index of B is the index of A plus the related index to A. + int IndexB = Group->getIndex(A) + DistanceToA / Size; + + // Try to insert B into the group. + if (Group->insertMember(B, IndexB, DesB.Align)) { + DEBUG(dbgs() << "LAA: Inserted:" << *B << '\n' + << " into the interleave group with" << *A << '\n'); + InterleaveGroupMap[B] = Group; + } + } + } +} + +void InterleavedAccessInfo::analyzeInterleaving( + const ValueToValueMap &Strides) { + // Holds load/store instructions in program order. + SmallVector InstrList; + + for (auto BB = TheLoop->block_begin(), BE = TheLoop->block_end(); BB != BE; + ++BB) { + bool IsPred = LoopAccessInfo::blockNeedsPredication(*BB, TheLoop, DT); + + for (auto I = (*BB)->begin(), E = (*BB)->end(); I != E; ++I) { + if (!isa(I) && !isa(I)) + continue; + // FIXME: As currently we can't handle predicated access, return directly. + if (IsPred) + return; + + InstrList.push_back(I); + } + } + + if (!InstrList.size()) + return; + + // Holds the write-write pairs that could be broken by the vectorization. + SmallVector, 2> WritePairs; + + DEBUG(dbgs() << "LAA: Analyzing interleaved accesses...\n"); + groupInterleavedAccesses(InstrList, WritePairs, Strides); + + // Filter eligible groups and set the insert position. We only keep the load + // group that has small gaps (less than half Delta) and the fully interleaved + // store group (has no gap). + for (auto I : InstrList) { + InterleaveGroup *Group = getInterleaveGroup(I); + if (!Group) + continue; + + if (I->mayReadFromMemory()) { + if (Group->getNumMembers() >= Group->getDelta() / 2) { + // Choose the first load in program order. + if (!Group->getInsertPos()) + Group->setInsertPos(I); + continue; + } + } else if (Group->getNumMembers() == Group->getDelta()) { + // Choose the last store in program order. + Group->setInsertPos(I); + continue; + } + + releaseGroup(Group); + } + + // Make sure the dependence of each write-write pair is still safe. + for (auto Pair : WritePairs) { + Instruction *A = Pair.first; + Instruction *B = Pair.second; + auto GroupA = getInterleaveGroup(A); + auto GroupB = getInterleaveGroup(B); + + if (!GroupA && !GroupB) + continue; + + Instruction *PosA = GroupA ? GroupA->getInsertPos() : A; + Instruction *PosB = GroupB ? GroupB->getInsertPos() : B; + + // Continue if neither A nor B will be moved to another position. 
+ if (PosA == A && PosB == B) + continue; + + bool IsSafe = true; + + for (auto I = InstrList.rbegin(), E = InstrList.rend(); I != E; ++I) { + if (*I == PosB) + break; + // If the position of A is after the position of B, it is unsafe. + if (*I == PosA) { + IsSafe = false; + break; + } + } + + if (IsSafe) + continue; + + if (PosA != A) + releaseGroup(GroupA); + if (PosB != B) + releaseGroup(GroupB); + } +} + +void InterleavedAccessInfo::print(raw_ostream &OS, unsigned Depth) const { + OS.indent(Depth) << "Interleaved access groups:\n"; + + SmallSet Seen; + for (auto I : InterleaveGroupMap) { + if (Seen.count(I.second)) + continue; + + // Print the group. + I.second->print(OS, Depth + 2); + OS << "\n"; + Seen.insert(I.second); + } +} + bool LoopAccessInfo::canAnalyzeLoop() { // We need to have a loop header. DEBUG(dbgs() << "LAA: Found a loop: " << @@ -1319,12 +1569,15 @@ const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides) - : DepChecker(SE, L), NumComparisons(0), TheLoop(L), SE(SE), DL(DL), - TLI(TLI), AA(AA), DT(DT), LI(LI), NumLoads(0), NumStores(0), - MaxSafeDepDistBytes(-1U), CanVecMem(false), - StoreToLoopInvariantAddress(false) { + : DepChecker(SE, L), InterleaveInfo(SE, L, DT, &DepChecker), + NumComparisons(0), TheLoop(L), SE(SE), DL(DL), TLI(TLI), AA(AA), DT(DT), + LI(LI), NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1U), + CanVecMem(false), StoreToLoopInvariantAddress(false) { if (canAnalyzeLoop()) analyzeLoop(Strides); + + if (CanVecMem && EnableInterleaving) + InterleaveInfo.analyzeInterleaving(Strides); } void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const { @@ -1354,6 +1607,9 @@ OS.indent(Depth) << "Store to invariant address was " << (StoreToLoopInvariantAddress ? "" : "not ") << "found in loop.\n"; + + InterleaveInfo.print(OS, Depth); + OS << "\n"; } const LoopAccessInfo & Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, unsigned Delta, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Delta, Indices, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -351,6 +355,9 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access group that \p Instr belongs to. + void vectorizeInterleaveGroup(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -693,8 +700,16 @@ return LAI->getRuntimePointerCheck(); } - const LoopAccessInfo *getLAI() const { - return LAI; + const LoopAccessInfo *getLAI() const { return LAI; } + + /// \brief Check if \p Instr belongs to any interleaved access group. + bool isAccessInterleaved(Instruction *Instr) { + return LAI->isAccessInterleaved(Instr); + } + + /// \brief Get the interleaved access group that \p I belongs to. + const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) { + return LAI->getInterleavedAccessGroup(Instr); } unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } @@ -1657,6 +1672,256 @@ "reverse"); } +// Get a mask to interleave \p NumVec vectors into a wide vector. +// I.E <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.E. +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential integers +// starting from 0, The second part consist of UNDEFs. +// I.E. <0, 1, 2, ..., NumInt - 1, undef, ..., undef> +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt, + unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = 0; i < NumInt; i++) + Mask.push_back(Builder.getInt32(i)); + + // Return directly if there is no undef value. + if (!NumUndef) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 2nd vector should +// not have more elements than the 1st vector. If the 2nd vector has less +// elements, extend it with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend with UNDEFs. + Constant *ExtMask = + getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. 
+static Value *ConcatenateVectors(IRBuilder<> &Builder, + ArrayRef InputList) { + unsigned NumVec = InputList.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector VecList; + SmallVector TmpList; + VecList.append(InputList.begin(), InputList.end()); + do { + for (unsigned i = 0; i < NumVec / 2; i++) + TmpList.push_back( + ConcatenateTwoVectors(Builder, VecList[2 * i], VecList[2 * i + 1])); + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(VecList[NumVec - 1]); + + VecList.clear(); + VecList.append(TmpList.begin(), TmpList.end()); + NumVec = VecList.size(); + TmpList.clear(); + } while (NumVec > 1); + + return VecList[0]; +} + +// Try to vectorize the interleave group that \p Instr belongs to. +// +// E.g. Translate following interleaved load group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; // Member of index 0 +// G = Pic[i+1]; // Member of index 1 +// B = Pic[i+2]; // Member of index 2 +// ... // do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following interleaved store group (Delta is 3): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; // Member of index 0 +// Pic[i+1] = G; // Member of index 1 +// Pic[i+2] = B; // Member of index 2 +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) { + const InterleaveGroup *Group = Legal->getInterleavedAccessGroup(Instr); + assert(Group && "Fail to get an interleaved access group."); + + // Skip if current instruction is not the insert position. + if (Instr != Group->getInsertPos()) + return; + + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned Delta = Group->getDelta(); + Type *VecTy = VectorType::get(ScalarTy, Delta * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + unsigned Index = Group->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. 
+ Value *NewPtr = + Builder.CreateExtractElement(PtrParts[Part], Builder.getInt32(0)); + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index)); + + // If the address is reverse, then the vector load/store needs to start at + // the last access of index 0. + // E.g. For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (Group->isReverse()) + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-(Delta * (VF - 1)))); + + // Cast to the vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load group. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + Instruction *CallI = Builder.CreateAlignedLoad( + NewPtrs[Part], Group->getAlign(), "wide.vec"); + + for (unsigned i = 0; i < Delta; i++) { + Instruction *Member = Group->getMember(i); + + // Skip the gaps in the group. + if (!Member) + continue; + + Constant *StrideMask = getStridedMask(Builder, i, Delta, VF); + Value *StridedVec = Builder.CreateShuffleVector( + CallI, UndefVec, StrideMask, "strided.vec"); + + // If this member has different type, cast the result type. + if (Member->getType() != ScalarTy) { + VectorType *OtherVTy = VectorType::get(Member->getType(), VF); + StridedVec = Builder.CreateBitOrPointerCast(StridedVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(Member); + Entry[Part] = + Group->isReverse() ? reverseVector(StridedVec) : StridedVec; + } + + propagateMetadata(CallI, Instr); + } + return; + } + + // The sub vector type for current instruction. + VectorType *SubVT = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store group. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector from each member. + SmallVector StoredVecs; + for (unsigned i = 0; i < Delta; i++) { + // Interleaved store group doesn't allow a gap, so each index has a member + Instruction *Member = Group->getMember(i); + assert(Member && "Fail to get a member from an interleaved store group"); + + Value *StoredVec = + getVectorValue(dyn_cast(Member)->getValueOperand())[Part]; + if (Group->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != SubVT) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, SubVT); + + StoredVecs.push_back(StoredVec); + } + + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the vector. + Constant *IMask = getInterleavedMask(Builder, VF, Delta); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + Instruction *CallI = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], Group->getAlign()); + propagateMetadata(CallI, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1664,6 +1929,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleave group if this access is interleaved. + if (Legal->isAccessInterleaved(Instr)) + return vectorizeInterleaveGroup(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? 
LI->getPointerOperand() : SI->getPointerOperand(); @@ -4575,6 +4844,41 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved access + if (Legal->isAccessInterleaved(I)) { + auto Group = Legal->getInterleavedAccessGroup(I); + assert(Group && "Fail to get an interleaved access group."); + + // Only calculate the cost once. + if (Group->getInsertPos() != I) + return 0; + + unsigned Delta = Group->getDelta(); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * Delta); + + SmallVector Indices; + if (LI) { + // Collect the gaps in interleaved loads. No need to do this for + // interleaved stores, which only support fully interleave without gap. + for (unsigned i = 0; i < Delta; i++) + if (Group->getMember(i)) + Indices.push_back(i); + } + + // Calculate the cost of the whole interleaved group. + unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy, + Group->getDelta(), Indices, + Group->getAlign(), AS); + + if (Group->isReverse()) + Cost += + Group->getNumMembers() * + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Analysis/LoopAccessAnalysis/analyze-interleaving.ll =================================================================== --- /dev/null +++ test/Analysis/LoopAccessAnalysis/analyze-interleaving.ll @@ -0,0 +1,544 @@ +; RUN: opt -S -loop-accesses -analyze -enable-interleaving=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check an interleaved load group of Delta 2 and an interleaved store group of +; Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check an interleaved load group of Delta 3 and an interleaved store group of +; Delta 3. 
+ +; int A[3072]; +; struct ST3 S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; S[i].x = X1 + 1; +; S[i].y = X2 + 2; +; S[i].z = X3 + 3; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 3: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK-NEXT: Index 2: +; CHECK: Interleave Group of Delta 3: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK-NEXT: Index 2: + +%struct.ST3 = type { i32, i32, i32 } +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check an interleaved load group of Delta 4. 
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 4: +; CHECK-NEXT: Index 0: %0 = load i32, i32* %x, align 4 +; CHECK-NEXT: Index 1: %1 = load i32, i32* %y, align 4 +; CHECK-NEXT: Index 2: %2 = load i32, i32* %z, align 4 +; CHECK-NEXT: Index 3: %3 = load i32, i32* %w, align 4 + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check on an interleaved store group of Delta 4. + +; int B[1024]; +; struct ST4 T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 4: +; CHECK-NEXT: Index 0: store i32 %add, i32* %x, align 4 +; CHECK-NEXT: Index 1: store i32 %mul, i32* %y, align 4 +; CHECK-NEXT: Index 2: store i32 %sub, i32* %z, align 4 +; CHECK-NEXT: Index 3: store i32 %add5, i32* %w, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 
1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check a reverse interleaved load group of Delta 4 and a reverse interleaved +; store group of Delta 4. + +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reverse_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +define void @test_reverse_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check an interleaved load group of Delta 2 with 1 gap (without the load on +; odd element). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NOT: Index 1 + +define void @even_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check an interleaved store group of Delta 2 with 1 gap. We should not identify +; such case as a group. 
+ +; void even_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; A[i] = B[i] + B[i+1]; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NOT: Index 0: store + +define void @even_store(i32* nocapture %A, i32* nocapture readonly %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check access groups identified from mixed loads and stores. + +; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check access groups identified from members with different kinds of type. 
+ +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: +; CHECK-NEXT: Index 1: + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" } + +; Check two interleaved store groups which will break write after write +; dependence. We should not identify them as interleaved groups. + +; void waw_dep(int *A) { +; char *Tmp = (char *)A; +; int *B = (int *) (++Tmp); +; for(int i = 0; i < 1024; i+=2) { +; A[i] = i; // (1) +; B[i] = i; // (2) +; A[i+1] = i + 1; // (3) +; B[i+1] = i + 1; // (4) +; } +; } +; +; The combine of (1)+(3) or (2)+(4) will break the dependence. + +; CHECK: Interleaved access groups: +; CHECK-NOT: Interleave Group: + +define void @waw_dep(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %incdec.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %2, i32* %arrayidx2, align 4 + %3 = or i64 %indvars.iv, 1 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %3 + %4 = trunc i64 %3 to i32 + store i32 %4, i32* %arrayidx5, align 4 + %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3 + store i32 %4, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check two interleaved store groups which won't break write after write +; dependence. + +; void no_waw_dep(int *A) { +; char *Tmp = (char *)A; +; int *B = (int *) (++Tmp); +; for(int i = 0; i < 1024; i+=2) { +; A[i] = i; // (1) +; A[i+1] = i + 1; // (2) +; B[i] = i; // (3) +; B[i+1] = i + 1; // (4) +; } +; } +; +; The combine of (1)+(2) or (3)+(4) won't break the dependence. 
+ +; CHECK: Interleaved access groups: +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: store i32 %2 +; CHECK-NEXT: Index 1: store i32 %4 +; CHECK: Interleave Group of Delta 2: +; CHECK-NEXT: Index 0: store i32 %2 +; CHECK-NEXT: Index 1: store i32 %4 + +define void @no_waw_dep(i32* nocapture %A) { +entry: + %0 = bitcast i32* %A to i8* + %incdec.ptr = getelementptr inbounds i8, i8* %0, i64 1 + %1 = bitcast i8* %incdec.ptr to i32* + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %2 = trunc i64 %indvars.iv to i32 + store i32 %2, i32* %arrayidx, align 4 + %3 = or i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %3 + %4 = trunc i64 %3 to i32 + store i32 %4, i32* %arrayidx3, align 4 + %arrayidx5 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv + store i32 %2, i32* %arrayidx5, align 4 + %arrayidx9 = getelementptr inbounds i32, i32* %1, i64 %3 + store i32 %4, i32* %arrayidx9, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,5 +1,5 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -enable-interleaving=true | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -enable-interleaving=true | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnueabi" @@ -102,26 +102,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; 
FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/interleaved-accesses.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -0,0 +1,427 @@ +; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaving=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Check vectorization on an interleaved load group of Delta 2 and an interleaved +; store group of Delta 2. + +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } + +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +%struct.ST2 = type { i32, i32 } +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on an interleaved load group of Delta 3 and an interleaved +; store group of Delta 3. 
+
+; int A[3072];
+; struct ST3 S[1024];
+; void test_struct_array_load3_store3() {
+; int *ptr = A;
+; for (int i = 0; i < 1024; i++) {
+; int X1 = *ptr++;
+; int X2 = *ptr++;
+; int X3 = *ptr++;
+; S[i].x = X1 + 1;
+; S[i].y = X2 + 2;
+; S[i].z = X3 + 3;
+; }
+; }
+
+; CHECK-LABEL: @test_struct_array_load3_store3(
+; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32>
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: add nsw <4 x i32> {{.*}},
+; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32>
+; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32>
+; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32>
+; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
+
+%struct.ST3 = type { i32, i32, i32 }
+@A = common global [3072 x i32] zeroinitializer, align 4
+@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
+
+define void @test_struct_array_load3_store3() {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
+  %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
+  %0 = load i32, i32* %ptr.016, align 4
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
+  %1 = load i32, i32* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
+  %2 = load i32, i32* %incdec.ptr1, align 4
+  %add = add nsw i32 %0, 1
+  %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
+  store i32 %add, i32* %x, align 4
+  %add3 = add nsw i32 %1, 2
+  %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
+  store i32 %add3, i32* %y, align 4
+  %add6 = add nsw i32 %2, 3
+  %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
+  store i32 %add6, i32* %z, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Check vectorization on an interleaved load group of Delta 4.
+ +; struct ST4{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST4 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + +; Check vectorization on an interleaved store group of Delta 4. 
+ +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), %entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Check vectorization on a reverse interleaved load group of Delta 2 and +; a reverse interleaved store group of Delta 2. 
+ +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on an interleaved load group of Delta 2 with 1 gap +; (missing the load of odd elements). + +; void even_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @even_load( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shl nsw <4 x i32> %strided.vec, +define void @even_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups identified from mixed +; loads/stores. 
+; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK-LABEL: @mixed_load_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check vectorization on interleaved access groups with members having different +; kinds of type. + +; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK-LABEL: @int_float_struct( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float> +; CHECK: add nsw <4 x i32> +; CHECK: fadd fast <4 x float> + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" }
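For reference, a minimal sketch (not part of the patch) of the IR shape the factor-2 checks above expect: the vectorizer replaces the scalar group members with one wide load, de-interleaving shufflevectors, and a re-interleaving shufflevector feeding one wide store. The value names %ptr, %r0, %r1 and %ptr2 are illustrative placeholders; only the wide memory operations and the shuffle masks matter.

  %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
  ; De-interleave: extract the even and odd elements (members 0 and 1 of the group).
  %strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; ...widened arithmetic on %strided.vec0 and %strided.vec1 producing %r0 and %r1...
  ; Re-interleave the two results and emit a single wide store.
  %interleaved.vec = shufflevector <4 x i32> %r0, <4 x i32> %r1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i32> %interleaved.vec, <8 x i32>* %ptr2, align 4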