Index: include/llvm/Analysis/VectorUtils.h =================================================================== --- include/llvm/Analysis/VectorUtils.h +++ include/llvm/Analysis/VectorUtils.h @@ -50,8 +50,11 @@ /// \brief Find the operand of the GEP that should be checked for consecutive /// stores. This ignores trailing indices that have no effect on the final -/// pointer. -unsigned getGEPInductionOperand(const GetElementPtrInst *Gep); +/// pointer. If \p Gep is not the last GEP in the chain computing the store +/// address, the memory access type of the store, \p MemAccessTy, should be +/// passed and is used instead of \p Gep's result element type. +unsigned getGEPInductionOperand(const GetElementPtrInst *Gep, + Type *MemAccessTy = nullptr); /// \brief If the argument is a GEP, then returns the operand identified by /// getGEPInductionOperand. However, if there is some other non-loop-invariant Index: lib/Analysis/VectorUtils.cpp =================================================================== --- lib/Analysis/VectorUtils.cpp +++ lib/Analysis/VectorUtils.cpp @@ -98,10 +98,12 @@ /// \brief Find the operand of the GEP that should be checked for consecutive /// stores. This ignores trailing indices that have no effect on the final /// pointer. -unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) { +unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep, + Type *MemAccessTy) { const DataLayout &DL = Gep->getModule()->getDataLayout(); unsigned LastOperand = Gep->getNumOperands() - 1; - unsigned GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType()); + unsigned GEPAllocSize = DL.getTypeAllocSize( + MemAccessTy ? MemAccessTy : Gep->getResultElementType()); // Walk backwards and try to peel off zeros. 
while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) { Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1330,7 +1330,7 @@ /// 0 - Stride is unknown or non-consecutive. /// 1 - Address is consecutive. /// -1 - Address is consecutive, and decreasing. - int isConsecutivePtr(Value *Ptr); + int isConsecutivePtr(Value *Ptr, Type *MemAccessTy = nullptr); /// Returns true if the value V is uniform within the loop. bool isUniform(Value *V); @@ -2169,11 +2169,23 @@ return Builder.CreateAdd(Val, Step, "induction"); } -int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { +int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr, Type *MemAccessTy) { assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr"); auto *SE = PSE.getSE(); - // Make sure that the pointer does not point to structs. - if (Ptr->getType()->getPointerElementType()->isAggregateType()) + // If MemAccessTy is nullptr, assume we are checking a Ptr used directly as + // a load/store address, so the pointer must not point to an aggregate type. + if (!MemAccessTy && + Ptr->getType()->getPointerElementType()->isAggregateType()) + return 0; + + // If MemAccessTy is not nullptr, we are checking a Ptr used indirectly as a + // load/store address. + // Make sure Ptr's element type has the same alloc size as MemAccessTy, so + // that a consecutive Ptr is also consecutive relative to MemAccessTy. + auto &DL = TheLoop->getHeader()->getModule()->getDataLayout(); + if (MemAccessTy && + DL.getTypeAllocSize(MemAccessTy) != + DL.getTypeAllocSize(Ptr->getType()->getPointerElementType())) return 0; // If this value is a pointer induction variable, we know it is consecutive. @@ -2196,8 +2208,9 @@ // Make sure that the pointer does not point to structs. 
PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); - if (GepPtrType->getElementType()->isAggregateType()) + if (GepPtrType->getElementType()->isAggregateType()) { return 0; + } // Make sure that all of the index operands are loop invariant. for (unsigned i = 1; i < NumOperands; ++i) @@ -2208,7 +2221,18 @@ return II.getConsecutiveDirection(); } - unsigned InductionOperand = getGEPInductionOperand(Gep); + // If GpPtr is a consecutive pointer relative to MemAccessTy and + // all of the GEP indices are loop invariant, then we know it is consecutive. + if (int Direction = isConsecutivePtr( + GpPtr, MemAccessTy ? MemAccessTy : Gep->getResultElementType())) { + // Make sure that all of the index operands are loop invariant. + for (unsigned i = 1; i < NumOperands; ++i) + if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop)) + return 0; + return Direction; + } + + unsigned InductionOperand = getGEPInductionOperand(Gep, MemAccessTy); // Check that all of the gep indices are uniform except for our induction // operand. @@ -2586,7 +2610,9 @@ // Handle consecutive loads/stores. GetElementPtrInst *Gep = getGEPInstruction(Ptr); if (ConsecutiveStride) { - if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + if (Gep && + Legal->isConsecutivePtr(Gep->getPointerOperand(), + Gep->getResultElementType())) { setDebugLocFromInst(Builder, Gep); Value *PtrOperand = Gep->getPointerOperand(); Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; Index: test/Transforms/LoopVectorize/consecutive-ptr.ll =================================================================== --- test/Transforms/LoopVectorize/consecutive-ptr.ll +++ test/Transforms/LoopVectorize/consecutive-ptr.ll @@ -0,0 +1,63 @@ +; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s + +; Make sure the loop vectorizer knows the store address is consecutive when it is computed through two chained GEPs, and generates a wide store. 
+; CHECK-LABEL: @foo( +; CHECK: store <4 x i32> + +define void @foo(i32 %N, i32* nocapture %ps) #0 { +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %add.ptr = getelementptr inbounds i32, i32* %ps, i64 %indvars.iv + %add.ptr1 = getelementptr inbounds i32, i32* %add.ptr, i64 1 + %tmp0 = trunc i64 %indvars.iv to i32 + store i32 %tmp0, i32* %add.ptr1, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +; Make sure the loop vectorizer knows the store address is consecutive when it is computed through a struct field GEP, and generates a wide store. 
+; CHECK-LABEL: @goo( +; CHECK: store <4 x i32> + +%struct.B = type { i32 } +define void @goo(i32 %N, %struct.B* %ps) #0 { +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ] + %arrayidx16 = getelementptr inbounds %struct.B, %struct.B* %ps, i64 %indvars.iv + %ival = getelementptr inbounds %struct.B, %struct.B* %arrayidx16, i64 0, i32 0 + %tmp1 = trunc i64 %indvars.iv to i32 + store i32 %tmp1, i32* %ival, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %N + br i1 %exitcond, label %for.end.loopexit, label %for.body + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} +