Index: include/llvm/Analysis/LoopAccessAnalysis.h =================================================================== --- include/llvm/Analysis/LoopAccessAnalysis.h +++ include/llvm/Analysis/LoopAccessAnalysis.h @@ -368,11 +368,126 @@ SmallVector AliasSetId; }; + struct InterleavedAccess { + InterleavedAccess(Instruction *Leader, int MBStride, unsigned Align) + : Align(Align), InsertPos(nullptr) { + // The Stride of an interleaved access is the absolute value of the member stride. + Stride = std::abs(MBStride); + Reverse = MBStride < 0; + Members.push_back(std::make_pair(Leader, 0)); // Member of index 0 + } + + InterleavedAccess() + : Stride(0), Reverse(false), Align(0), InsertPos(nullptr) {} + + bool isReverse() const { return Reverse; } + unsigned getStride() const { return Stride; } + unsigned getAlign() const { return Align; } + + // A fully interleaved access has all the members from index 0 to (Stride - 1). + // Returns the number of missing members. + unsigned getNumGaps() const { + unsigned NumMembers = Members.size(); + assert(NumMembers <= Stride && "Unexpected number of members"); + return Stride - NumMembers; + } + + // Returns true if a new member is successfully inserted. \p Index is relative + // to the leader (the member with index 0), so it could be negative. + // Insertion fails if it breaks either of the two rules below: + // 1. The max index is always less than the stride. + // 2. There is already a member with the same index. + bool insertMember(Instruction *Inst, int Index, unsigned Alignment) { + unsigned AbsIdx = std::abs(Index); + if (Index < 0) { + unsigned LIndex = Members.back().second; + // Breaks rule 1. Doesn't belong to this group. + if (LIndex + AbsIdx >= Stride) + return false; + + for (auto &I : Members) + I.second += AbsIdx; + + Members.insert(Members.begin(), std::make_pair(Inst, 0)); + Align = std::min(Align, Alignment); + return true; + } + + // Breaks rule 1. Doesn't belong to this group. + if (AbsIdx >= Stride) + return false; + + auto I = Members.begin(); + for (auto E = Members.end(); I != E; ++I) { + if (I->second > AbsIdx) + break; + + // Breaks rule 2. There is already a member with the same index. + if (I->second == AbsIdx) + return false; + } + + Members.insert(I, std::make_pair(Inst, AbsIdx)); + Align = std::min(Align, Alignment); + return true; + } + + void removeMember(Instruction *Inst) { + for (auto I = Members.begin(), E = Members.end(); I != E; ++I) { + if (I->first != Inst) + continue; + + Members.erase(I); + return; + } + } + + const ArrayRef<std::pair<Instruction *, unsigned>> getMembers() const { + return Members; + } + + unsigned getIndex(Instruction *Inst) const { + for (auto I : Members) + if (I.first == Inst) + return I.second; + llvm_unreachable("Contains no such member"); + } + + Instruction *getInsertPos() const { return InsertPos; } + void setInsertPos(Instruction *Inst) { InsertPos = Inst; } + + private: + unsigned Stride; + bool Reverse; + unsigned Align; + SmallVector<std::pair<Instruction *, unsigned>, 4> Members; + + // To avoid breaking dependences, an interleaved access should be inserted + // at either the first load or the last store in program order. + // E.g.
%even = load i32 // Insert Position + // %add = add i32 %even // Use of %even + // %odd = load i32 + // + // store i32 %even + // %odd = add i32 // Def of %odd + // store i32 %odd // Insert Position + Instruction *InsertPos; + }; + LoopAccessInfo(Loop *L, ScalarEvolution *SE, const DataLayout &DL, const TargetLibraryInfo *TLI, AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, const ValueToValueMap &Strides); + ~LoopAccessInfo() { + SmallSet DelSet; + // Collect all the pointers first to avoid deleting a pointer twice. + for (auto &I : InterleavedAccessMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + /// Return true we can analyze the memory accesses in the loop and there are /// no memory dependence cycles. bool canVectorizeMemory() const { return CanVecMem; } @@ -440,10 +555,40 @@ return StoreToLoopInvariantAddress; } + bool isInterleaved(Instruction *I) const { + return InterleavedAccessMap.count(I); + } + + /// \brief Return the interleaved access the given instruction \p I belongs to. + /// Return nullptr if \p I doesn't belong to any interleaved access. + const LoopAccessInfo::InterleavedAccess * + getInterleavedAccess(Instruction *I) const { + if (!InterleavedAccessMap.count(I)) + return nullptr; + return InterleavedAccessMap.find(I)->second; + } + private: /// \brief Analyze the loop. Substitute symbolic strides using Strides. void analyzeLoop(const ValueToValueMap &Strides); + /// \brief Analyze the loop dependences. \p Loads and \p Stores contain + /// loads/stores in program order. Substitute symbolic strides using Strides. + void analyzeDependences(ArrayRef Loads, ArrayRef Stores, + bool IsAnnotatedParallel, + const ValueToValueMap &Strides); + + /// \brief Analyze the interleaved accesses. The \p AccessList contains + /// memory accesses in program order. + void analyzeInterleaving(ArrayRef<Instruction *> AccessList, + const ValueToValueMap &Strides); + + /// \brief Group loads/stores into interleaved accesses according to their + /// address, stride, and memory object size. \p AccessList contains memory + /// accesses in program order. + void groupInterleavedAccess(ArrayRef<Instruction *> AccessList, + const ValueToValueMap &Strides); + /// \brief Check if the structure of the loop allows it to be analyzed by this /// pass. bool canAnalyzeLoop(); @@ -485,6 +630,9 @@ /// \brief The diagnostics report generated for the analysis. E.g. why we /// couldn't analyze the loop. Optional Report; + + /// Maps each member to the interleaved access it belongs to. + DenseMap<Instruction *, InterleavedAccess *> InterleavedAccessMap; }; Value *stripIntegerCast(Value *V); Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -444,6 +444,16 @@ unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; + /// \return The cost of one member in the interleaved access. + /// \p VecTy is the wide vector type of the whole interleaved access. + /// \p SubTy is the vector type of the member. + /// \p Index is the index of the current member in the interleaved access. + /// \p Gap is the number of missing members. + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) const; + /// \brief Calculate the cost of performing a vector reduction.
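Note: to make the member-index bookkeeping of InterleavedAccess above concrete, here is a minimal standalone model of insertMember() that uses plain strings in place of Instruction pointers and ignores alignment; the access names and the stride-3 example are invented purely for illustration.

#include <cstdio>
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>

// Simplified stand-in for InterleavedAccess: named slots instead of
// Instruction pointers, no alignment tracking.
struct Group {
  unsigned Stride;
  std::vector<std::pair<std::string, unsigned>> Members; // (name, index)

  Group(const std::string &Leader, unsigned Stride) : Stride(Stride) {
    Members.push_back(std::make_pair(Leader, 0u)); // the leader has index 0
  }

  // Same two rules as insertMember(): the largest index must stay below the
  // stride, and no two members may share an index. A negative index shifts
  // all existing members up so that the new member becomes index 0.
  bool insert(const std::string &Name, int Index) {
    unsigned AbsIdx = std::abs(Index);
    if (Index < 0) {
      if (Members.back().second + AbsIdx >= Stride)
        return false; // rule 1
      for (auto &M : Members)
        M.second += AbsIdx;
      Members.insert(Members.begin(), std::make_pair(Name, 0u));
      return true;
    }
    if (AbsIdx >= Stride)
      return false; // rule 1
    auto I = Members.begin();
    for (auto E = Members.end(); I != E; ++I) {
      if (I->second > AbsIdx)
        break;
      if (I->second == AbsIdx)
        return false; // rule 2
    }
    Members.insert(I, std::make_pair(Name, AbsIdx));
    return true;
  }
};

int main() {
  Group G("A[i+1]", 3);                       // A[i+1] is found first and leads
  std::printf("%d\n", G.insert("A[i]", -1));  // 1: becomes index 0, A[i+1] -> 1
  std::printf("%d\n", G.insert("A[i+2]", 2)); // 1: index 2 completes the group
  std::printf("%d\n", G.insert("A[i+3]", 3)); // 0: breaks rule 1 (3 >= stride)
  std::printf("%d\n", G.insert("B[i]", 0));   // 0: breaks rule 2 (index 0 taken)
  for (auto &M : G.Members)
    std::printf("%s -> index %u\n", M.first.c_str(), M.second);
}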
/// /// This is the cost of reducing the vector value of type \p Ty to a scalar @@ -582,6 +592,10 @@ virtual unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) = 0; + virtual unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + Type *SubTy, unsigned Index, + unsigned Gap, unsigned Alignment, + unsigned AddressSpace) = 0; virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) = 0; virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, @@ -740,6 +754,13 @@ unsigned AddressSpace) override { return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) override { + return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, Gap, + Alignment, AddressSpace); + } unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) override { return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -300,6 +300,13 @@ return 1; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) { + return 1; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) { return 1; Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -522,6 +522,53 @@ return Cost; } + unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, Type *SubTy, + unsigned Index, unsigned Gap, + unsigned Alignment, + unsigned AddressSpace) { + VectorType *VT = dyn_cast(VecTy); + VectorType *SubVT = dyn_cast(SubTy); + assert(VT && SubVT && "Expect vector types"); + + unsigned NumElts = VT->getNumElements(); + unsigned SubNumElts = SubVT->getNumElements(); + unsigned Stride = NumElts / SubNumElts; + assert(NumElts % SubNumElts == 0 && Stride > 1 && "Invalide vector types"); + assert(Index < Stride && "Invalid Index"); + assert(Gap < Stride && "Invalid Gap"); + + // The memory op cost is equal to the cost of the whole wide vector divied + // by the actual number of members. + unsigned Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + Cost = Cost / (Stride - Gap); + + // Then need to calculate the cost for interleave operations. + if (Opcode == Instruction::Load) { + // E.g. Interleaved load for the vector %V1 of index 1: + // %V = load <4 x i32>, <4 x i32>* %ptr + // %V1 = shufflevector <4 x i32> %V, <4 x i32> undef, <1, 3> + // The cost is extract element 1 and element 3 from the wide vector, and + // insert them sequentially into the sub vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, VecTy, + Index + i * Stride); + Cost += getVectorInstrCost(Instruction::InsertElement, SubTy, i); + } + } else { + // E.g. 
Interleaved store for the vector %V1 of index 1: + // %IV = shufflevector <2 x i32> %V0, <2 x i32> %V1, <0, 2, 1, 3> + // store <4 x i32> %IV, <4 x i32> %ptr + // The cost is extract element 0 and element 1 from %V1, and insert them + // into lane 1 and lane 3 of the wide vector. + for (unsigned i = 0; i < SubNumElts; i++) { + Cost += getVectorInstrCost(Instruction::ExtractElement, SubTy, i); + Cost += getVectorInstrCost(Instruction::InsertElement, VecTy, + Index + i * Stride); + } + } + return Cost; + } + unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef Tys) { unsigned ISD = 0; Index: lib/Analysis/LoopAccessAnalysis.cpp =================================================================== --- lib/Analysis/LoopAccessAnalysis.cpp +++ lib/Analysis/LoopAccessAnalysis.cpp @@ -41,6 +41,10 @@ VectorizerParams::VectorizationInterleave)); unsigned VectorizerParams::VectorizationInterleave; +static cl::opt AnalyzeInterleavedAccess( + "analyze-interleaved-access", cl::init(false), cl::Hidden, + cl::desc("Enable analyzing interleaved access in loop")); + static cl::opt RuntimeMemoryCheckThreshold( "runtime-memory-check-threshold", cl::Hidden, cl::desc("When performing memory disambiguation checks at runtime do not " @@ -777,21 +781,28 @@ VectorizerParams::VectorizationFactor : 1); unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ? VectorizerParams::VectorizationInterleave : 1); - - // The distance must be bigger than the size needed for a vectorized version - // of the operation and the size of the vectorized operation must not be - // bigger than the currrent maximum size. - if (Distance < 2*TypeByteSize || - 2*TypeByteSize > MaxSafeDepDistBytes || - Distance < TypeByteSize * ForcedUnroll * ForcedFactor) { + // The number of iterations to be vectorized and unrolled. + unsigned NumIter = + std::max(ForcedFactor * ForcedUnroll, static_cast(2)); + + unsigned Stride = std::abs(StrideAPtr); + // Positive distance is safe when + // Distance <= MaxSafeDepDistBytes + // And either one below is true + // Distance < Stride * TypeByteSize + // Distance >= TypeByteSize * NumIter * Stride + if (!(Distance < Stride * TypeByteSize || + Distance >= TypeByteSize * NumIter * Stride) || + Distance > MaxSafeDepDistBytes) { DEBUG(dbgs() << "LAA: Failure because of Positive distance " << Val.getSExtValue() << '\n'); return Dependence::Backward; } - // Positive distance bigger than max vectorization factor. - MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ? - Distance : MaxSafeDepDistBytes; + // If Distance < Stride * TypeByteSize, it is always safe. just keep the + // current MaxSafeDepDistBytes. Otherwise, update the MaxSafeDepDistBytes. + if (Distance >= Stride * TypeByteSize) + MaxSafeDepDistBytes = Distance; bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && @@ -947,89 +958,184 @@ return true; } -void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { +struct StrideDescriptor { + StrideDescriptor(int Stride, const SCEV *Scev, unsigned Size, unsigned Align) + : Stride(Stride), Scev(Scev), Size(Size), Align(Align) {} - typedef SmallVector ValueVector; - typedef SmallPtrSet ValueSet; + StrideDescriptor() : Stride(0), Scev(nullptr), Size(0), Align(0) {} - // Holds the Load and Store *instructions*. - ValueVector Loads; - ValueVector Stores; + int Stride; + const SCEV *Scev; + unsigned Size; + unsigned Align; +}; - // Holds all the different accesses in the loop. 
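Note: a quick standalone check of the revised positive-distance rule above; the helper name and the concrete stride/type-size/iteration numbers are made up for illustration only.

#include <cstdio>

// Mirrors the check above: a positive dependence distance is accepted if it
// does not exceed MaxSafeDepDistBytes and it is either below
// Stride * TypeByteSize or at least TypeByteSize * NumIter * Stride.
static bool isSafePositiveDistance(unsigned Distance, unsigned Stride,
                                   unsigned TypeByteSize, unsigned NumIter,
                                   unsigned MaxSafeDepDistBytes) {
  if (Distance > MaxSafeDepDistBytes)
    return false;
  return Distance < Stride * TypeByteSize ||
         Distance >= TypeByteSize * NumIter * Stride;
}

int main() {
  // i32 accesses (4 bytes), stride 2, vectorizing/unrolling 4 iterations:
  // the bounds are 2 * 4 = 8 and 4 * 4 * 2 = 32 bytes.
  std::printf("%d\n", isSafePositiveDistance(4, 2, 4, 4, ~0u));  // 1: 4 < 8
  std::printf("%d\n", isSafePositiveDistance(16, 2, 4, 4, ~0u)); // 0: between the bounds
  std::printf("%d\n", isSafePositiveDistance(32, 2, 4, 4, ~0u)); // 1: 32 >= 32
}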
- unsigned NumReads = 0; - unsigned NumReadWrites = 0; +void LoopAccessInfo::groupInterleavedAccess(ArrayRef<Instruction *> InstList, + const ValueToValueMap &Strides) { + // Collect and record all the strided accesses. + DenseMap<Instruction *, StrideDescriptor> StridedAccesses; + for (auto I : InstList) { + // FIXME: Handle predicated loads/stores. + if (blockNeedsPredication(I->getParent(), TheLoop, DT)) + continue; + + LoadInst *LI = dyn_cast<LoadInst>(I); + StoreInst *SI = dyn_cast<StoreInst>(I); + assert((LI || SI) && "Invalid access instruction"); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + // Only analyze non-unit stride accesses. + int Stride = isStridedPtr(SE, Ptr, TheLoop, Strides); + if (std::abs(Stride) < 2) + continue; + + const SCEV *Scev = replaceSymbolicStrideSCEV(SE, Strides, Ptr); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + unsigned Size = DL.getTypeAllocSize(PtrTy->getElementType()); + + // An alignment of 0 means target ABI alignment. + unsigned Align = LI ? LI->getAlignment() : SI->getAlignment(); + if (!Align) + Align = DL.getABITypeAlignment(PtrTy->getElementType()); + + StridedAccesses[I] = StrideDescriptor(Stride, Scev, Size, Align); + } - PtrRtCheck.Pointers.clear(); - PtrRtCheck.Need = false; + // Search for load-load/store-store pairs A-B in bottom-up order. Create a new + // interleaved access for A if A doesn't belong to any interleaved access. + // Try to insert B into the interleaved access of A according to 3 rules: + // 1. A and B have the same stride. + // 2. A and B have the same memory object size. + // 3. The distance from B to the leader of the interleaved access is less + // than the Stride. + // + // Even though interleaved access vectorization may change the program + // order, there is no need to worry about dependences: the dependence check + // has already verified that there is no overlap between pointers with + // different bases. For pointers sharing the same base, there may be three + // kinds of dependences: + // + // The RAW (Read After Write) dependence. + // E.g. A[i] = a; + // b = A[i]; + // This won't exist as it is a store-load forwarding conflict, which has + // already been checked and forbidden in the dependence check. + // + // The WAW (Write After Write) dependence: + // E.g. A[i] = a; (1) + // A[i] = b; (2) + // The bottom-up search order guarantees that (2) is always inserted into the + // interleaved store below the interleaved store that (1) belongs to. + // + // The WAR (Write After Read) dependence: + // E.g. a = A[i]; + // A[i] = b; + // The insert positions of the interleaved store and the interleaved load + // guarantee that the write is always after the read. + for (auto I = InstList.rbegin(), E = InstList.rend(); I != E; ++I) { + Instruction *A = *I; + if (!StridedAccesses.count(A)) + continue; + + StrideDescriptor DesA = StridedAccesses[A]; + + InterleavedAccess *IA; + if (isInterleaved(A)) { + IA = InterleavedAccessMap[A]; + } else { + // Create a new interleaved access with A. + IA = new InterleavedAccess(A, DesA.Stride, DesA.Align); + InterleavedAccessMap[A] = IA; + } - const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + for (auto II = std::next(I); II != E; ++II) { + Instruction *B = *II; + if (A->mayReadFromMemory() != B->mayReadFromMemory()) + continue; - // For each block. - for (Loop::block_iterator bb = TheLoop->block_begin(), - be = TheLoop->block_end(); bb != be; ++bb) { + if (!StridedAccesses.count(B) || isInterleaved(B)) + continue; - // Scan the BB and collect legal loads and stores.
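Note: the index handed to insertMember is IndexA + DistanceToA / Size; a small standalone trace of the bottom-up search (invented access names, 4-byte elements) makes the arithmetic visible.

#include <cstdio>

struct Access {
  const char *Name;
  long long ByteOffset; // constant byte offset of the access within one iteration
};

int main() {
  // Program order, top to bottom; the bottom-up search picks the last access
  // as the first leader A and walks upwards over the candidates B.
  Access InstList[] = {
      {"load A[3*i+2]", 8}, {"load A[3*i+1]", 4}, {"load A[3*i]", 0}};
  const int Size = 4;            // memory object size in bytes
  const Access &A = InstList[2]; // leader, index 0
  int IndexA = 0;                // re-queried per pair in the real code, since a
                                 // negative insertion shifts existing indices

  for (int i = 1; i >= 0; --i) {
    const Access &B = InstList[i];
    long long DistanceToA = B.ByteOffset - A.ByteOffset; // SCEV distance B - A
    int IndexToA = static_cast<int>(DistanceToA / Size);
    std::printf("%s -> insertMember index %d\n", B.Name, IndexA + IndexToA);
  }
  // Prints: load A[3*i+1] -> insertMember index 1
  //         load A[3*i+2] -> insertMember index 2
}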
- for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; - ++it) { + StrideDescriptor DesB = StridedAccesses[B]; - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (it->mayReadFromMemory()) { - // Many math library functions read the rounding mode. We will only - // vectorize a loop if it contains known function calls that don't set - // the flag. Therefore, it is safe to ignore this read from memory. - CallInst *Call = dyn_cast<CallInst>(it); - if (Call && getIntrinsicIDForCall(Call, TLI)) - continue; + // Check rules 1 and 2. + if (DesB.Stride != DesA.Stride || DesB.Size != DesA.Size) + continue; - // If the function has an explicit vectorized counterpart, we can safely - // assume that it can be vectorized. - if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && - TLI->isFunctionVectorizable(Call->getCalledFunction()->getName())) - continue; + // Calculate the distance and prepare for rule 3. + const SCEV *DistToA = SE->getMinusSCEV(DesB.Scev, DesA.Scev); + const SCEVConstant *C = dyn_cast<SCEVConstant>(DistToA); + if (!C) + continue; - LoadInst *Ld = dyn_cast<LoadInst>(it); - if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { - emitAnalysis(LoopAccessReport(Ld) - << "read with atomic ordering or volatile read"); - DEBUG(dbgs() << "LAA: Found a non-simple load.\n"); - CanVecMem = false; - return; - } - NumLoads++; - Loads.push_back(Ld); - DepChecker.addAccess(Ld); + int DistanceToA = C->getValue()->getValue().getZExtValue(); + // B reads/writes the same location as A. + if (!DistanceToA) continue; - } - // Save 'store' instructions. Abort if other instructions write to memory. - if (it->mayWriteToMemory()) { - StoreInst *St = dyn_cast<StoreInst>(it); - if (!St) { - emitAnalysis(LoopAccessReport(it) << - "instruction cannot be vectorized"); - CanVecMem = false; - return; - } - if (!St->isSimple() && !IsAnnotatedParallel) { - emitAnalysis(LoopAccessReport(St) - << "write with atomic ordering or volatile write"); - DEBUG(dbgs() << "LAA: Found a non-simple store.\n"); - CanVecMem = false; - return; - } - NumStores++; - Stores.push_back(St); - DepChecker.addAccess(St); - } - } // Next instr. - } // Next block. + int Size = static_cast<int>(DesA.Size); + assert(!(DistanceToA % Size) && "Distance must be a multiple of the size"); + + // The index of B is equal to the index of A plus the relative index. + int IndexToA = DistanceToA / Size; + int IndexA = static_cast<int>(IA->getIndex(A)); + + // Try to insert B into the group with A. + if (IA->insertMember(B, IndexA + IndexToA, DesB.Align)) + InterleavedAccessMap[B] = IA; + continue; + } + } +} + +void LoopAccessInfo::analyzeInterleaving(ArrayRef<Instruction *> InstList, + const ValueToValueMap &Strides) { + if (InstList.size() < 2) + return; + + groupInterleavedAccess(InstList, Strides); + + // Set insert positions and remove interleaved stores with gaps. + for (auto I : InstList) { + if (!isInterleaved(I)) + continue; + + InterleavedAccess *IA = InterleavedAccessMap[I]; + + // Set the insert position for an interleaved load. + if (I->mayReadFromMemory()) { + // Use the first member in program order, i.e. set it only while the + // current insert position is still null. + if (!IA->getInsertPos()) + IA->setInsertPos(I); + continue; + } + + // Set the insert position for a fully interleaved store. + if (!IA->getNumGaps()) { + // Finally, the insert position is the last member in program order. + IA->setInsertPos(I); + continue; + } - // Now we have two lists that hold the loads and the stores.
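Note: a compact standalone summary of the insert-position rules above and of the gap handling that continues below; the member lists are invented for illustration.

#include <cstdio>
#include <vector>

struct Member {
  const char *Name; // vector order is program order
  bool IsLoad;
};

// Mirrors analyzeInterleaving(): a load group is inserted at its first member
// in program order; a store group at its last member, and only if it has no
// gaps (every index 0..Stride-1 is present). A store group with gaps is
// dropped, since the wide store would also write the missing lanes.
static const char *pickInsertPos(const std::vector<Member> &M, unsigned Gaps) {
  if (M.front().IsLoad)
    return M.front().Name; // first load in program order
  if (Gaps == 0)
    return M.back().Name;  // last store in program order
  return nullptr;          // store group with gaps: discard
}

int main() {
  std::vector<Member> Loads = {{"load A[i]", true}, {"load A[i+1]", true}};
  std::vector<Member> Stores = {{"store B[i]", false}, {"store B[i+1]", false}};
  std::vector<Member> GappyStores = {{"store C[i]", false}}; // stride 2, index 1 missing

  std::printf("%s\n", pickInsertPos(Loads, 0));  // load A[i]
  std::printf("%s\n", pickInsertPos(Stores, 0)); // store B[i+1]
  const char *P = pickInsertPos(GappyStores, 1);
  std::printf("%s\n", P ? P : "group discarded"); // group discarded
}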
- // Next, we find the pointers that they use. + // Erase a interleaved store with gaps. + ArrayRef> Members = IA->getMembers(); + for (auto I : Members) + InterleavedAccessMap.erase(I.first); + delete IA; + } +} +void LoopAccessInfo::analyzeDependences(ArrayRef Loads, + ArrayRef Stores, + bool IsAnnotatedParallel, + const ValueToValueMap &Strides) { + // Holds all the different accesses in the loop. + unsigned NumReads = 0; + unsigned NumReadWrites = 0; + + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; + + typedef SmallPtrSet ValueSet; // Check if we see any stores. If there are no stores, then we don't // care if the pointers are *restrict*. if (!Stores.size()) { @@ -1049,9 +1155,9 @@ // writes and between reads and writes, but not between reads and reads. ValueSet Seen; - ValueVector::iterator I, IE; - for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) { - StoreInst *ST = cast(*I); + // Find the pointers that stores use. + for (auto I : Stores) { + StoreInst *ST = cast(I); Value* Ptr = ST->getPointerOperand(); // Check for store to loop invariant address. StoreToLoopInvariantAddress |= isUniform(Ptr); @@ -1072,15 +1178,15 @@ } if (IsAnnotatedParallel) { - DEBUG(dbgs() - << "LAA: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); + DEBUG(dbgs() << "LAA: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); CanVecMem = true; return; } - for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { - LoadInst *LD = cast(*I); + // Find the pointers that loads use. + for (auto I : Loads) { + LoadInst *LD = cast(I); Value* Ptr = LD->getPointerOperand(); // If we did *not* see this pointer before, insert it to the // read list. If we *did* see it before, then it is already in @@ -1192,6 +1298,89 @@ } } +void LoopAccessInfo::analyzeLoop(const ValueToValueMap &Strides) { + + typedef SmallVector ValueVector; + + // Holds the Load and Store *instructions* in program order. + ValueVector Loads; + ValueVector Stores; + // Holds the load/store instructions in program order. + SmallVector InstList; + + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); + bb != be; ++bb) { + + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. + if (it->mayReadFromMemory()) { + // Many math library functions read the rounding mode. We will only + // vectorize a loop if it contains known function calls that don't set + // the flag. Therefore, it is safe to ignore this read from memory. + CallInst *Call = dyn_cast(it); + if (Call && getIntrinsicIDForCall(Call, TLI)) + continue; + + // If the function has an explicit vectorized counterpart, we can safely + // assume that it can be vectorized. 
+ if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() && + TLI->isFunctionVectorizable(Call->getCalledFunction()->getName())) + continue; + + LoadInst *Ld = dyn_cast(it); + if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) { + emitAnalysis(LoopAccessReport(Ld) + << "read with atomic ordering or volatile read"); + DEBUG(dbgs() << "LAA: Found a non-simple load.\n"); + CanVecMem = false; + return; + } + NumLoads++; + Loads.push_back(Ld); + InstList.push_back(Ld); + DepChecker.addAccess(Ld); + continue; + } + + // Save 'store' instructions. Abort if other instructions write to memory. + if (it->mayWriteToMemory()) { + StoreInst *St = dyn_cast(it); + if (!St) { + emitAnalysis(LoopAccessReport(it) + << "instruction cannot be vectorized"); + CanVecMem = false; + return; + } + if (!St->isSimple() && !IsAnnotatedParallel) { + emitAnalysis(LoopAccessReport(St) + << "write with atomic ordering or volatile write"); + DEBUG(dbgs() << "LAA: Found a non-simple store.\n"); + CanVecMem = false; + return; + } + NumStores++; + Stores.push_back(St); + InstList.push_back(St); + DepChecker.addAccess(St); + } + } // Next instr. + } // Next block. + + analyzeDependences(Loads, Stores, IsAnnotatedParallel, Strides); + + if (CanVecMem && AnalyzeInterleavedAccess) + analyzeInterleaving(InstList, Strides); +} + bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT) { assert(TheLoop->contains(BB) && "Unknown block used"); Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -235,6 +235,13 @@ return TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace); } +unsigned TargetTransformInfo::getInterleavedMemoryOpCost( + unsigned Opcode, Type *VecTy, Type *SubTy, unsigned Index, unsigned Gap, + unsigned Alignment, unsigned AddressSpace) const { + return TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, SubTy, Index, Gap, + Alignment, AddressSpace); +} + unsigned TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Tys) const { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -34,6 +34,10 @@ // Variable uniformity checks are inspired by: // Karrenberg, R. and Hack, S. Whole Function Vectorization. // +// The interleaved access vectorization is based on the paper: +// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved +// Data for SIMD +// // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // @@ -351,6 +355,9 @@ /// broadcast them into a vector. VectorParts &getVectorValue(Value *V); + /// Try to vectorize the interleaved access that \p Instr belongs to. + void vectorizeInterleavedAccess(Instruction *Instr); + /// Generate a shuffle sequence that will reverse the vector Vec. 
virtual Value *reverseVector(Value *Vec); @@ -693,8 +700,13 @@ return LAI->getRuntimePointerCheck(); } - const LoopAccessInfo *getLAI() const { - return LAI; + const LoopAccessInfo *getLAI() const { return LAI; } + + bool isInterleaved(Instruction *I) { return LAI->isInterleaved(I); } + + const LoopAccessInfo::InterleavedAccess * + getInterleavedAccess(Instruction *I) { + return LAI->getInterleavedAccess(I); } unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); } @@ -1657,6 +1669,252 @@ "reverse"); } +// Get a mask to interleave a wide vector concatenated from \p NumVec small +// vectors. +// I.E <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...> +// E.g. For 2 interleaved vectors, if VF is 4, the mask is: +// <0, 4, 1, 5, 2, 6, 3, 7> +static Constant *getInterleavedStoreMask(IRBuilder<> &Builder, unsigned VF, + unsigned NumVec) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + for (unsigned j = 0; j < NumVec; j++) + Mask.push_back(Builder.getInt32(j * VF + i)); + + return ConstantVector::get(Mask); +} + +// Get the strided mask starting from index \p Start. +// I.E. +static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start, + unsigned Stride, unsigned VF) { + SmallVector Mask; + for (unsigned i = 0; i < VF; i++) + Mask.push_back(Builder.getInt32(Start + i * Stride)); + + return ConstantVector::get(Mask); +} + +// Get a mask of two parts: The first part consist of sequential constants, +// The second part consist of UNDEFs. +// I.E. +static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Idx, + unsigned NumSeq, unsigned NumUndef) { + SmallVector Mask; + for (unsigned i = Idx; i < Idx + NumSeq; i++) + Mask.push_back(Builder.getInt32(i)); + // Return directly if there is no undef value. + if (!NumUndef) + return ConstantVector::get(Mask); + + Constant *Undef = UndefValue::get(Builder.getInt32Ty()); + for (unsigned i = 0; i < NumUndef; i++) + Mask.push_back(Undef); + return ConstantVector::get(Mask); +} + +// Concatenate two vectors with the same element type. The 1st vector always +// has the same number of elements as the 2nd vector or more elements. +// If the 1st vector has more elements, extend the 2nd vector with UNDEFs. +static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1, + Value *V2) { + VectorType *VecTy1 = dyn_cast(V1->getType()); + VectorType *VecTy2 = dyn_cast(V2->getType()); + assert(VecTy1 && VecTy2 && + VecTy1->getScalarType() == VecTy2->getScalarType() && + "Expect two vectors with the same element type"); + + unsigned NumElts1 = VecTy1->getNumElements(); + unsigned NumElts2 = VecTy2->getNumElements(); + assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements"); + + if (NumElts1 > NumElts2) { + // Extend V2 with undef elements. + Constant *ExtMask = + getSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2); + V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask); + } + + Constant *Mask = getSequentialMask(Builder, 0, NumElts1 + NumElts2, 0); + return Builder.CreateShuffleVector(V1, V2, Mask); +} + +// Concatenate vectors in the given list. All vectors have the same type. 
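Note: the three mask helpers above can be sanity-checked in isolation; this standalone sketch recomputes them with plain integers (an undef lane printed as 'u') and reproduces the mask constants that appear in the tests further down.

#include <cstdio>
#include <vector>

// Same index patterns as getInterleavedStoreMask, getStridedMask and
// getSequentialMask above; -1 stands for an undef lane.
static std::vector<int> interleavedStoreMask(unsigned VF, unsigned NumVec) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVec; j++)
      Mask.push_back(static_cast<int>(j * VF + i));
  return Mask;
}

static std::vector<int> stridedMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(static_cast<int>(Start + i * Stride));
  return Mask;
}

static std::vector<int> sequentialMask(unsigned Idx, unsigned NumSeq, unsigned NumUndef) {
  std::vector<int> Mask;
  for (unsigned i = Idx; i < Idx + NumSeq; i++)
    Mask.push_back(static_cast<int>(i));
  for (unsigned i = 0; i < NumUndef; i++)
    Mask.push_back(-1);
  return Mask;
}

static void print(const char *Name, const std::vector<int> &Mask) {
  std::printf("%s: <", Name);
  for (size_t i = 0; i < Mask.size(); i++) {
    if (Mask[i] < 0)
      std::printf("u");
    else
      std::printf("%d", Mask[i]);
    if (i + 1 < Mask.size())
      std::printf(", ");
  }
  std::printf(">\n");
}

int main() {
  // VF = 4 and three members (e.g. the R, G, B example further down).
  print("interleaved store", interleavedStoreMask(4, 3)); // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
  print("strided, index 1", stridedMask(1, 3, 4));        // <1, 4, 7, 10>
  print("sequential + undef", sequentialMask(0, 4, 4));   // <0, 1, 2, 3, u, u, u, u>
}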
+static Value *ConcatenateVectors(IRBuilder<> &Builder, ArrayRef List) { + unsigned NumVec = List.size(); + assert(NumVec > 1 && "Should be at least two vectors"); + + SmallVector Vecs; + SmallVector TmpList; + Vecs.append(List.begin(), List.end()); + do { + for (unsigned i = 0; i < NumVec / 2; i++) + TmpList.push_back( + ConcatenateTwoVectors(Builder, Vecs[2 * i], Vecs[2 * i + 1])); + + // Push the last vector if the total number of vectors is odd. + if (NumVec % 2 != 0) + TmpList.push_back(Vecs[NumVec - 1]); + + Vecs.clear(); + Vecs.append(TmpList.begin(), TmpList.end()); + NumVec = Vecs.size(); + TmpList.clear(); + } while (NumVec > 1); + + return Vecs[0]; +} + +// Try to vectorize the interleaved access that \p Instr belongs to. +// +// E.g. Translate following loads (VF is 4): +// for (i = 0; i < N; i+=3) { +// R = Pic[i]; +// G = Pic[i+1]; +// B = Pic[i+2]; +// ... // do something to R, G, B +// } +// To: +// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B +// %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9> ; R elements +// %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10> ; G elements +// %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11> ; B elements +// +// Or translate following stores (VF is 4): +// for (i = 0; i < N; i+=3) { +// ... do something to R, G, B +// Pic[i] = R; +// Pic[i+1] = G; +// Pic[i+2] = B; +// } +// To: +// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7> +// %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u> +// %interleaved.vec = shuffle %R_G.vec, %B_U.vec, +// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements +// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B +void InnerLoopVectorizer::vectorizeInterleavedAccess(Instruction *Instr) { + auto *IA = Legal->getInterleavedAccess(Instr); + assert(IA && "Doesn't belong to any interleaved access"); + + // Skip if current instruction is not the insert position. + if (Instr != IA->getInsertPos()) + return; + + LoadInst *LI = dyn_cast(Instr); + StoreInst *SI = dyn_cast(Instr); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + + // Prepare for the vector type of the interleaved load/store. + Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + unsigned Stride = IA->getStride(); + Type *VecTy = VectorType::get(ScalarTy, Stride * VF); + Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + + // Prepare for the new pointers. + setDebugLocFromInst(Builder, Ptr); + VectorParts &PtrParts = getVectorValue(Ptr); + SmallVector NewPtrs; + unsigned Idx = IA->getIndex(Instr); + for (unsigned Part = 0; Part < UF; Part++) { + // Notice current instruction could be any index. Need to adjust the address + // to the member of index 0. + // + // E.g. a = A[i+1]; // Member of index 1 (Current instruction) + // b = A[i]; // Member of index 0 + // Current pointer is pointed to A[i+1], adjust it to A[i]. + // + // E.g. A[i+1] = a; // Member of index 1 + // A[i] = b; // Member of index 0 + // A[i+2] = c; // Member of index 2 (Current instruction) + // Current pointer is pointed to A[i+2], adjust it to A[i]. + Value *NewPtr = + Builder.CreateExtractElement(PtrParts[Part], Builder.getInt32(0)); + NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Idx)); + + // If the address is reversed, then the wide load/store needs to start at + // the last access of index 0. + // E.g. 
For the reversed access of interleaved load with 2 members: + // {A[i], A[i+1]}, {A[i-2], A[i-1]}, {A[i-4], A[i-3]}, {A[i-6], A[i-5]} + // The new pointer is now pointed to A[i]. Adjust it to A[i-6]. + if (IA->isReverse()) + NewPtr = + Builder.CreateGEP(NewPtr, Builder.getInt32(-(Stride * (VF - 1)))); + + // Cast to the new wide vector pointer type. + NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy)); + } + + setDebugLocFromInst(Builder, Instr); + const ArrayRef> Members = IA->getMembers(); + Value *UndefVec = UndefValue::get(VecTy); + + // Vectorize the interleaved load. + if (LI) { + for (unsigned Part = 0; Part < UF; Part++) { + // Load a wide vector contains elements for all members. + Instruction *WideLoad = + Builder.CreateAlignedLoad(NewPtrs[Part], IA->getAlign(), "wide.vec"); + + for (auto Member : Members) { + // Extract the wide vector for current member according to the index. + Constant *StridedMask = + getStridedMask(Builder, Member.second, Stride, VF); + Value *StrideVec = Builder.CreateShuffleVector( + WideLoad, UndefVec, StridedMask, "strided.vec"); + + Instruction *MBInst = Member.first; + + // If this member has different type, cast the result type. + if (MBInst->getType() != Instr->getType()) { + VectorType *OtherVTy = VectorType::get(MBInst->getType(), VF); + StrideVec = Builder.CreateBitOrPointerCast(StrideVec, OtherVTy); + } + + VectorParts &Entry = WidenMap.get(MBInst); + Entry[Part] = IA->isReverse() ? reverseVector(StrideVec) : StrideVec; + } + + propagateMetadata(WideLoad, Instr); + } + return; + } + + // The narrow vector type for current instruction. + VectorType *MBVecTy = VectorType::get(ScalarTy, VF); + + // Vectorize the interleaved store. + for (unsigned Part = 0; Part < UF; Part++) { + // Collect the stored vector for each member. + SmallVector StoredVecs; + for (auto I : Members) { + Value *StoredVec = + getVectorValue(dyn_cast(I.first)->getValueOperand())[Part]; + + if (IA->isReverse()) + StoredVec = reverseVector(StoredVec); + + // If this member has different type, cast it to a unified type. + if (StoredVec->getType() != MBVecTy) + StoredVec = Builder.CreateBitOrPointerCast(StoredVec, MBVecTy); + + StoredVecs.push_back(StoredVec); + } + // Concatenate all vectors into a wide vector. + Value *WideVec = ConcatenateVectors(Builder, StoredVecs); + + // Interleave the elements in the wide vector. + Constant *IMask = getInterleavedStoreMask(Builder, VF, Stride); + Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask, + "interleaved.vec"); + + // Store the interleaved wide vector. + Instruction *WideStore = + Builder.CreateAlignedStore(IVec, NewPtrs[Part], IA->getAlign()); + propagateMetadata(WideStore, Instr); + } +} + void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); @@ -1664,6 +1922,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); + // Try to vectorize the interleaved access. + if (Legal->isInterleaved(Instr)) + return vectorizeInterleavedAccess(Instr); + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? 
LI->getPointerOperand() : SI->getPointerOperand(); @@ -4575,6 +4837,24 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); + // Interleaved load/store + if (Legal->isInterleaved(I)) { + auto IA = Legal->getInterleavedAccess(I); + Type *WideVecTy = + VectorType::get(VectorTy->getVectorElementType(), + VectorTy->getVectorNumElements() * IA->getStride()); + + // Calculate the cost of the interleaved access for this member. + unsigned Cost = TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, VectorTy, IA->getIndex(I), + IA->getNumGaps(), IA->getAlign(), AS); + + if (IA->isReverse()) + Cost += + TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0); + return Cost; + } + // Scalarized loads/stores. int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; Index: test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll +++ test/Transforms/LoopVectorize/AArch64/arbitrary-induction-step.ll @@ -1,8 +1,7 @@ -; RUN: opt -S < %s -loop-vectorize 2>&1 | FileCheck %s -; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 | FileCheck %s --check-prefix=FORCE-VEC +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -analyze-interleaved-access=true 2>&1 | FileCheck %s +; RUN: opt -S < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -analyze-interleaved-access=true | FileCheck %s --check-prefix=FORCE-VEC target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-gnueabi" ; Test integer induction variable of step 2: ; for (int i = 0; i < 1024; i+=2) { @@ -102,26 +101,23 @@ ; } ; CHECK-LABEL: @ptr_ind_plus2( -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: load i32, i32* -; CHECK: mul nsw i32 -; CHECK: mul nsw i32 -; CHECK: add nsw i32 -; CHECK: add nsw i32 -; CHECK: %index.next = add i64 %index, 2 -; CHECK: %21 = icmp eq i64 %index.next, 1024 +; CHECK: %[[V0:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V0]], <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = load <8 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: %index.next = add i64 %index, 8 +; CHECK: icmp eq i64 %index.next, 1024 ; FORCE-VEC-LABEL: @ptr_ind_plus2( -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> -; FORCE-VEC: load i32, i32* -; FORCE-VEC: insertelement <2 x i32> +; FORCE-VEC: %[[V:.*]] = load <4 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> +; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> ; FORCE-VEC: mul nsw <2 x i32> ; FORCE-VEC: add nsw <2 x i32> ; FORCE-VEC: %index.next = add i64 %index, 2 Index: test/Transforms/LoopVectorize/interleaved-access.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/interleaved-access.ll @@ -0,0 +1,602 @@ +; RUN: opt -S 
-loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -analyze-interleaved-access=true -runtime-memory-check-threshold=24 < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" + +; Make sure that we can vectorize the interleave access to a struct. +; struct ST2 { +; int x; +; int y; +; }; +; int test_struct_load2(struct ST2 *A) { +; int sum = 0; +; for (int i = 0; i < 1024; ++i) { +; sum += A[i].x; +; sum += A[i].y; +; } +; +; return sum; +; } + +; CHECK-LABEL: @test_struct_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> + +%struct.ST2 = type { i32, i32 } +define i32 @test_struct_load2(%struct.ST2* nocapture %A) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %add = add nsw i32 %0, %sum + %add1 = add nsw i32 %add, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %add1.lcssa = phi i32 [ %add1, %for.body ] + ret i32 %add1.lcssa +} + + +; Make sure that the following case can be vectorized. +; int AB[1024]; +; int CD[1024]; +; void test_array_load2_store2(int C, int D) { +; for (int i = 0; i < 1024; i+=2) { +; int A = AB[i]; +; int B = AB[i+1]; +; CD[i] = A + C; +; CD[i+1] = B * D; +; } +; } +; CHECK-LABEL: @test_array_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: mul nsw <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +@AB = common global [1024 x i32] zeroinitializer, align 4 +@CD = common global [1024 x i32] zeroinitializer, align 4 + +define void @test_array_load2_store2(i32 %C, i32 %D) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx0, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx1, align 4 + %add = add nsw i32 %0, %C + %mul = mul nsw i32 %2, %D + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %1 + store i32 %mul, i32* %arrayidx3, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. 
+; struct ST3{ +; int x; +; int y; +; int z; +; }; +; int test_struct_load3(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> + +%struct.ST3 = type { i32, i32, i32 } + +define i32 @test_struct_load3(%struct.ST3* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.015 = phi i32 [ 0, %entry ], [ %add5, %for.body ] + %x = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.015 + %y = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST3, %struct.ST3* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %add5 +} + + +; Make sure that the following case can be vectorized. +; int A[3072]; +; struct ST S[1024]; +; void test_struct_st3() { +; int *ptr = A; +; for (int i = 0; i < 1024; i++) { +; int X1 = *ptr++; +; int X2 = *ptr++; +; int X3 = *ptr++; +; T[i].x = X1 + 1; +; T[i].y = X2 + 2; +; T[i].z = X3 + 3; +; } +; } + +; CHECK-LABEL: @test_struct_array_load3_store3( +; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: add nsw <4 x i32> {{.*}}, +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> +; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4 + +@A = common global [3072 x i32] zeroinitializer, align 4 +@S = common global [1024 x %struct.ST3] zeroinitializer, align 4 + +define void @test_struct_array_load3_store3() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1 + %0 = load i32, i32* %ptr.016, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2 + %1 = load i32, i32* %incdec.ptr, align 4 + %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3 + %2 = load i32, i32* %incdec.ptr1, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %add3 = add 
nsw i32 %1, 2 + %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1 + store i32 %add3, i32* %y, align 4 + %add6 = add nsw i32 %2, 3 + %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2 + store i32 %add6, i32* %z, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; Make sure that the following case can be vectorized. +; struct ST3{ +; int x; +; int y; +; int z; +; int w; +; }; +; int test_struct_load4(struct ST3 *S) { +; int r = 0; +; for (int i = 0; i < 1024; i++) { +; r += S[i].x; +; r -= S[i].y; +; r += S[i].z; +; r -= S[i].w; +; } +; return r; +; } + +; CHECK-LABEL: @test_struct_load4( +; CHECK: %wide.vec = load <16 x i32>, <16 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub <4 x i32> + +%struct.ST4 = type { i32, i32, i32, i32 } + +define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ] + %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add = add nsw i32 %0, %r.022 + %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub = sub i32 %add, %1 + %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2 + %2 = load i32, i32* %z, align 4 + %add5 = add nsw i32 %sub, %2 + %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3 + %3 = load i32, i32* %w, align 4 + %sub8 = sub i32 %add5, %3 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret i32 %sub8 +} + + +; int B[1024]; +; struct ST T[1024]; +; void test_struct_store4() { +; int *ptr = B; +; for (int i = 0; i < 1024; i++) { +; int X = *ptr++; +; T[i].x = X + 1; +; T[i].y = X * 2; +; T[i].z = X + 3; +; T[i].w = X + 4; +; } +; } + +; CHECK-LABEL: @test_struct_store4( +; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>* +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shl nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: add nsw <4 x i32> %[[LD]], +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> +; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4 + +@B = common global [1024 x i32] zeroinitializer, align 4 +@T = common global [1024 x %struct.ST4] zeroinitializer, align 4 + +define void @test_struct_store4() { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %ptr.017 = phi i32* [ getelementptr inbounds ([1024 x i32], [1024 x i32]* @B, i64 0, i64 0), 
%entry ], [ %incdec.ptr, %for.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %ptr.017, i64 1 + %0 = load i32, i32* %ptr.017, align 4 + %add = add nsw i32 %0, 1 + %x = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x, align 4 + %mul = shl nsw i32 %0, 1 + %y = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 1 + store i32 %mul, i32* %y, align 4 + %sub = add nsw i32 %0, 3 + %z = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 2 + store i32 %sub, i32* %z, align 4 + %add5 = add nsw i32 %0, 4 + %w = getelementptr inbounds [1024 x %struct.ST4], [1024 x %struct.ST4]* @T, i64 0, i64 %indvars.iv, i32 3 + store i32 %add5, i32* %w, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; This case checks that reversed interleaved loads can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; int test_reversed_load2(struct ST2 *A) { +; int add = 0; +; int sub = 0; +; for (int i = 1023; i >= 0; i--) { +; add += A[i].x; +; sub -= A[i].y; +; } +; return add*sub; +; } + +; CHECK-LABEL: @test_reversed_load2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> + +define i32 @test_reversed_load2(%struct.ST2* nocapture readonly %A) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + %mul = mul nsw i32 %sub4, %add1 + ret i32 %mul + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %sub.015 = phi i32 [ 0, %entry ], [ %sub4, %for.body ] + %add.014 = phi i32 [ 0, %entry ], [ %add1, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %add1 = add nsw i32 %0, %add.014 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %1 = load i32, i32* %y, align 4 + %sub4 = sub nsw i32 %sub.015, %1 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; This case checks that reversed interleaved loads/stores can be vectorized: +; struct ST2 { +; int x; +; int y; +; }; +; +; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) { +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; // interleaved load of index 0 +; int b = A[i].y - i; // interleaved load of index 1 +; B[i].x = a; // interleaved store of index 0 +; B[i].y = b; // interleaved store of index 1 +; } +; } + +; CHECK-LABEL: @test_reversed_load2_store2( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: sub nsw <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> +; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> 
+; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4 + +define void @test_reversed_load2_store2(%struct.ST2* nocapture readonly %A, %struct.ST2* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %x, align 4 + %1 = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %0, %1 + %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1 + %2 = load i32, i32* %y, align 4 + %sub = sub nsw i32 %2, %1 + %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0 + store i32 %add, i32* %x5, align 4 + %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1 + store i32 %sub, i32* %y8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check the load of odd elements of A can be vectorized as interleaved access. +; void odd_load(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; B[i/2] = A[i] * 2; +; } + +; CHECK-LABEL: @odd_load( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK-NOT: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shl nsw <4 x i32> %strided.vec, +define void @odd_load(i32* nocapture readonly %A, i32* nocapture %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %mul = shl nsw i32 %0, 1 + %1 = lshr exact i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %mul, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check the store of odd elements of A can not be vectorized as interleaved +; access. 
BTW, the interleaved load of B still can be vectorized as interleaved +; access +; void odd_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) +; A[i] = B[i] + B[i+1]; +; } + +; CHECK-LABEL: @odd_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK-NOT: store <8 x i32> +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +; CHECK: extractelement <4 x i32> +; CHECK: store i32 +define void @odd_store(i32* nocapture %A, i32* nocapture readonly %B) { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %arrayidx5 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add3, i32* %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check that mixed interleaved loads/stores can be vectorized. +; void mixed_load_store(int *A, int *B) { +; for (unsigned i = 0; i < 1024; i+=2) { +; B[i] = A[i] * A[i+1]; +; B[i+1] = A[i] + A[i+1]; +; } +; } + +; CHECK-LABEL: @mixed_load_store( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4 +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> +; CHECK: store <8 x i32> %interleaved.vec +define void @mixed_load_store(i32* nocapture readonly %A, i32* nocapture %B) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %2, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + store i32 %mul, i32* %arrayidx4, align 4 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i32, i32* %arrayidx2, align 4 + %add10 = add nsw i32 %4, %3 + %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %1 + store i32 %add10, i32* %arrayidx13, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2 + %cmp = icmp ult i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +; Check that interleaved loads of different types can be vectorized. As the int +; value is index 0 of interleaved access. The interleaved load uses a integer +; vector pointer. 
+; struct IntFloat { +; int a; +; float b; +; }; +; +; int SA; +; float SB; +; +; void int_float_struct(struct IntFloat *A) { +; int SumA; +; float SumB; +; for (unsigned i = 0; i < 1024; i++) { +; SumA += A[i].a; +; SumB += A[i].b; +; } +; SA = SumA; +; SB = SumB; +; } + +; CHECK-LABEL: @int_float_struct( +; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4 +; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> +; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float> +; CHECK: add nsw <4 x i32> +; CHECK: fadd fast <4 x float> + +%struct.IntFloat = type { i32, float } + +@SA = common global i32 0, align 4 +@SB = common global float 0.000000e+00, align 4 + +define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + store i32 %add, i32* @SA, align 4 + store float %add3, float* @SB, align 4 + ret void + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ] + %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0 + %0 = load i32, i32* %a, align 4 + %add = add nsw i32 %0, %SumA.013 + %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1 + %1 = load float, float* %b, align 4 + %add3 = fadd fast float %SumB.014, %1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { "unsafe-fp-math"="true" }
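Note: as a cross-check of what the shuffles in these tests compute, here is a tiny standalone model of one vectorized iteration of a stride-2 load/store group (VF = 4); the array contents and the +1 / *2 operations are invented for illustration.

#include <cstdio>

int main() {
  // A[] holds 4 consecutive {x, y} pairs, i.e. the <8 x i32> %wide.vec.
  int A[8] = {10, 11, 20, 21, 30, 31, 40, 41};

  // De-interleave: the strided shuffles <0, 2, 4, 6> and <1, 3, 5, 7>.
  int X[4], Y[4];
  for (int i = 0; i < 4; i++) {
    X[i] = A[2 * i];     // lane i of the index-0 member
    Y[i] = A[2 * i + 1]; // lane i of the index-1 member
  }

  // Re-interleave for the store: the mask <0, 4, 1, 5, 2, 6, 3, 7> applied to
  // the concatenation of the two member vectors.
  int B[8];
  for (int i = 0; i < 4; i++) {
    B[2 * i] = X[i] + 1;     // e.g. B[i].x = a
    B[2 * i + 1] = Y[i] * 2; // e.g. B[i].y = b
  }

  for (int i = 0; i < 8; i++)
    std::printf("%d ", B[i]);
  std::printf("\n"); // 11 22 21 42 31 62 41 82
}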