Diff 70737

llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 293 Lines • ▼ Show 20 Lines
static Value getPointerOperand(Value I) {		static Value getPointerOperand(Value I) {
if (auto *LI = dyn_cast<LoadInst>(I))		if (auto *LI = dyn_cast<LoadInst>(I))
return LI->getPointerOperand();		return LI->getPointerOperand();
if (auto *SI = dyn_cast<StoreInst>(I))		if (auto *SI = dyn_cast<StoreInst>(I))
return SI->getPointerOperand();		return SI->getPointerOperand();
return nullptr;		return nullptr;
}		}

		/// Reports whether \p Ty is "irregular" at vectorization factor \p VF.
		///
		/// For VF > 1 the type is irregular when VF times the allocated size of
		/// \p Ty differs from the store size of the <VF x Ty> vector type. For
		/// VF <= 1 it is irregular when the allocated size of \p Ty (in bits)
		/// differs from its type size (in bits).
		static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
		  // Scalar case: only ask whether consecutive elements of type Ty would
		  // have padding between them.
		  if (VF <= 1)
		    return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);

		  // Vector case: an array of VF elements of Ty must occupy exactly as many
		  // bytes as a stored <VF x Ty> vector to be "bitcast compatible" with it.
		  auto *WideTy = VectorType::get(Ty, VF);
		  const auto ArrayBytes = VF * DL.getTypeAllocSize(Ty);
		  return ArrayBytes != DL.getTypeStoreSize(WideTy);
		}

/// InnerLoopVectorizer vectorizes loops which contain only one basic		/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).		/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple		/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:		/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration		/// * It inserts an epilogue loop for handling loops that don't have iteration
/// counts that are known to be a multiple of the vectorization factor.		/// counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.		/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable		/// * Scalarization (implementation using scalars) of un-vectorizable
▲ Show 20 Lines • Show All 1,296 Lines • ▼ Show 20 Lines	public:

/// Returns true if vector representation of the instruction \p I		/// Returns true if vector representation of the instruction \p I
/// requires mask.		/// requires mask.
bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }		bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
unsigned getNumStores() const { return LAI->getNumStores(); }		unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }		unsigned getNumLoads() const { return LAI->getNumLoads(); }
unsigned getNumPredStores() const { return NumPredStores; }		unsigned getNumPredStores() const { return NumPredStores; }

		/// Returns true if \p I is a store instruction in a predicated block that
		/// will be scalarized during vectorization.
		bool isPredicatedStore(Instruction *I);

		/// Returns true if \p I is a memory instruction that has a consecutive or
		/// consecutive-like pointer operand. Consecutive-like pointers are pointers
		/// that are treated like consecutive pointers during vectorization. The
		/// pointer operands of interleaved accesses are an example.
		bool hasConsecutiveLikePtrOperand(Instruction *I);

		/// Returns true if \p I is a memory instruction that must be scalarized
		/// during vectorization.
		bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);

private:		private:
/// Check if a single basic block loop is vectorizable.		/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count		/// At this point we know that this is a loop with a constant trip count
/// and we only need to check individual instructions.		/// and we only need to check individual instructions.
bool canVectorizeInstrs();		bool canVectorizeInstrs();

/// When we vectorize loops we may change the order in which		/// When we vectorize loops we may change the order in which
/// we read and write from memory. This method checks if it is		/// we read and write from memory. This method checks if it is
▲ Show 20 Lines • Show All 1,106 Lines • ▼ Show 20 Lines	void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Value *Ptr = getPointerOperand(Instr);		Value *Ptr = getPointerOperand(Instr);
unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();		unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
// An alignment of 0 means target abi alignment. We need to use the scalar's		// An alignment of 0 means target abi alignment. We need to use the scalar's
// target abi alignment in such a case.		// target abi alignment in such a case.
const DataLayout &DL = Instr->getModule()->getDataLayout();		const DataLayout &DL = Instr->getModule()->getDataLayout();
if (!Alignment)		if (!Alignment)
Alignment = DL.getABITypeAlignment(ScalarDataTy);		Alignment = DL.getABITypeAlignment(ScalarDataTy);
unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();		unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ScalarDataTy);
uint64_t VectorElementSize = DL.getTypeStoreSize(DataTy) / VF;

if (SI && Legal->blockNeedsPredication(SI->getParent()) &&		// Scalarize the memory instruction if necessary.
!Legal->isMaskRequired(SI))		if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
return scalarizeInstruction(Instr, true);		return scalarizeInstruction(Instr, Legal->isPredicatedStore(Instr));

if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);

// If the pointer is loop invariant scalarize the load.
if (LI && Legal->isUniform(Ptr))
return scalarizeInstruction(Instr);

// If the pointer is non-consecutive and gather/scatter is not supported		// Determine if the pointer operand of the access is either consecutive or
// scalarize the instruction.		// reverse consecutive.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);		int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;		bool Reverse = ConsecutiveStride < 0;
bool CreateGatherScatter =
!ConsecutiveStride && ((LI && Legal->isLegalMaskedGather(ScalarDataTy)) \|\|
(SI && Legal->isLegalMaskedScatter(ScalarDataTy)));

if (!ConsecutiveStride && !CreateGatherScatter)		// Determine if either a gather or scatter operation is legal.
return scalarizeInstruction(Instr);		bool CreateGatherScatter =
		!ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);

VectorParts VectorGep;		VectorParts VectorGep;

// Handle consecutive loads/stores.		// Handle consecutive loads/stores.
GetElementPtrInst *Gep = getGEPInstruction(Ptr);		GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (ConsecutiveStride) {		if (ConsecutiveStride) {
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {		if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
setDebugLocFromInst(Builder, Gep);		setDebugLocFromInst(Builder, Gep);
▲ Show 20 Lines • Show All 2,500 Lines • ▼ Show 20 Lines	if (!ScalarIndUpdate)
continue;		continue;

// The induction variable and its update instruction will remain scalar.		// The induction variable and its update instruction will remain scalar.
Scalars.insert(Ind);		Scalars.insert(Ind);
Scalars.insert(IndUpdate);		Scalars.insert(IndUpdate);
}		}
}		}

		// Returns true when \p I belongs to an interleaved access, or when it has
		// a pointer operand for which isConsecutivePtr reports a non-zero result.
		// Instructions without a pointer operand yield false.
		bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
		  // Interleaved accesses count as consecutive-like regardless of stride.
		  if (isAccessInterleaved(I))
		    return true;

		  // getPointerOperand yields null for anything that is not a load or store.
		  auto *PtrOp = getPointerOperand(I);
		  return PtrOp && isConsecutivePtr(PtrOp) != 0;
		}

		// Returns true when \p I is a store whose parent block needs predication
		// and for which no mask is required (see isMaskRequired).
		bool LoopVectorizationLegality::isPredicatedStore(Instruction *I) {
		  auto *Store = dyn_cast<StoreInst>(I);
		  if (!Store)
		    return false;

		  // Stores outside predicated blocks are never "predicated stores".
		  if (!blockNeedsPredication(Store->getParent()))
		    return false;

		  return !isMaskRequired(Store);
		}

		// Decides whether the load or store \p I has to be scalarized at
		// vectorization factor \p VF. The checks run in order and the first one
		// that applies determines the result:
		//   1. member of an interleaved access       -> false
		//   2. load whose pointer operand is uniform -> true
		//   3. non-consecutive pointer and no legal gather/scatter -> true
		//   4. predicated store (see isPredicatedStore) -> true
		//   5. irregular scalar type at \p VF (see hasIrregularType) -> true
		// Otherwise returns false. \p I must be a LoadInst or StoreInst unless it
		// is part of an interleaved access; this is asserted.
		bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
		Instruction *I, unsigned VF) {

		// If the memory instruction is in an interleaved group, it will be
		// vectorized and its pointer will remain uniform.
		if (isAccessInterleaved(I))
		return false;

		// Get and ensure we have a valid memory instruction.
		LoadInst *LI = dyn_cast<LoadInst>(I);
		StoreInst *SI = dyn_cast<StoreInst>(I);
		assert((LI \|\| SI) && "Invalid memory instruction");

		// If the pointer operand is uniform (loop invariant), the memory instruction
		// will be scalarized. Note that this check only applies to loads; a store
		// through a uniform pointer falls through to the checks below.
		auto *Ptr = getPointerOperand(I);
		if (LI && isUniform(Ptr))
		return true;

		// If the pointer operand is non-consecutive and neither a gather nor a
		// scatter operation is legal, the memory instruction will be scalarized.
		if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
		return true;

		// If the instruction is a store located in a block that needs predication
		// and no mask is required for it, it will be scalarized.
		if (isPredicatedStore(I))
		return true;

		// If the instruction's allocated size doesn't equal its type size, it
		// requires padding and will be scalarized. The type inspected is the loaded
		// type for loads and the stored value's type for stores.
		auto &DL = I->getModule()->getDataLayout();
		auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
		if (hasIrregularType(ScalarTy, DL, VF))
		return true;

		// Otherwise, the memory instruction should be vectorized if the rest of the
		// loop is.
		return false;
		}

void LoopVectorizationLegality::collectLoopUniforms() {		void LoopVectorizationLegality::collectLoopUniforms() {
// We now know that the loop is vectorizable!		// We now know that the loop is vectorizable!
// Collect instructions inside the loop that will remain uniform after		// Collect instructions inside the loop that will remain uniform after
// vectorization.		// vectorization.

// Global values, params and instructions outside of current loop are out of		// Global values, params and instructions outside of current loop are out of
// scope.		// scope.
auto isOutOfScope = [&](Value *V) -> bool {		auto isOutOfScope = [&](Value *V) -> bool {
Instruction *I = dyn_cast<Instruction>(V);		Instruction *I = dyn_cast<Instruction>(V);
return (!I \|\| !TheLoop->contains(I));		return (!I \|\| !TheLoop->contains(I));
};		};

SetVector<Instruction *> Worklist;		SetVector<Instruction *> Worklist;
BasicBlock *Latch = TheLoop->getLoopLatch();		BasicBlock *Latch = TheLoop->getLoopLatch();
// Start with the conditional branch.		// Start with the conditional branch.
if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) {		if (!isOutOfScope(Latch->getTerminator()->getOperand(0))) {
Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0));		Instruction *Cmp = cast<Instruction>(Latch->getTerminator()->getOperand(0));
Worklist.insert(Cmp);		Worklist.insert(Cmp);
DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");		DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n");
}		}

// Add all consecutive pointer values; these values will be uniform after		// Holds consecutive and consecutive-like pointers. Consecutive-like pointers
// vectorization (and subsequent cleanup). Although non-consecutive, we also		// are pointers that are treated like consecutive pointers during
// add the pointer operands of interleaved accesses since they are treated		// vectorization. The pointer operands of interleaved accesses are an
// like consecutive pointers during vectorization.		// example.
		SmallPtrSet<Instruction *, 8> ConsecutiveLikePtrs;

		// Holds pointer operands of instructions that are possibly non-uniform.
		SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;

		// Iterate over the instructions in the loop, and collect all
		// consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
		// that a consecutive-like pointer operand will be scalarized, we collect it
		// in PossibleNonUniformPtrs instead. We use two sets here because a single
		// getelementptr instruction can be used by both vectorized and scalarized
		// memory instructions. For example, if a loop loads and stores from the same
		// location, but the store is conditional, the store will be scalarized, and
		// the getelementptr won't remain uniform.
for (auto *BB : TheLoop->blocks())		for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {		for (auto &I : *BB) {
Instruction *Ptr = nullptr;
if (I.getType()->isPointerTy() && isConsecutivePtr(&I))		// If the pointer operand is not consecutive-like, there's nothing to do.
Ptr = &I;		auto *Ptr = dyn_cast_or_null<Instruction>(getPointerOperand(&I));
else if (isAccessInterleaved(&I))		if (!Ptr \|\| isUniform(Ptr) \|\| !hasConsecutiveLikePtrOperand(&I))
Ptr = cast<Instruction>(getPointerOperand(&I));
else
continue;		continue;
Worklist.insert(Ptr);
DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ptr << "\n");		// Ensure the memory instruction will not be scalarized, making its
		// pointer operand non-uniform.
		if (memoryInstructionMustBeScalarized(&I))
		PossibleNonUniformPtrs.insert(Ptr);

		// If the memory instruction will be vectorized, its consecutive-like
		// pointer operand should remain uniform.
		else
		ConsecutiveLikePtrs.insert(Ptr);
		}

		// Add to the Worklist all consecutive and consecutive-like pointers that
		// aren't also identified as possibly non-uniform.
		for (auto *V : ConsecutiveLikePtrs)
		if (!PossibleNonUniformPtrs.count(V)) {
		DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n");
		Worklist.insert(V);
}		}

// Expand Worklist in topological order: whenever a new instruction		// Expand Worklist in topological order: whenever a new instruction
// is added , its users should be either already inside Worklist, or		// is added , its users should be either already inside Worklist, or
// out of scope. It ensures a uniform instruction will only be used		// out of scope. It ensures a uniform instruction will only be used
// by uniform instructions or out of scope instructions.		// by uniform instructions or out of scope instructions.
unsigned idx = 0;		unsigned idx = 0;
while (idx != Worklist.size()) {		while (idx != Worklist.size()) {
▲ Show 20 Lines • Show All 1,194 Lines • ▼ Show 20 Lines	if (Legal->isAccessInterleaved(I)) {
TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);		TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);

// FIXME: The interleaved load group with a huge gap could be even more		// FIXME: The interleaved load group with a huge gap could be even more
// expensive than scalar operations. Then we could ignore such group and		// expensive than scalar operations. Then we could ignore such group and
// use scalar operations instead.		// use scalar operations instead.
return Cost;		return Cost;
}		}

// Scalarized loads/stores.		// Check if the memory instruction will be scalarized.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);		if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
bool UseGatherOrScatter =
(ConsecutiveStride == 0) && Legal->isLegalGatherOrScatter(I);

bool Reverse = ConsecutiveStride < 0;
const DataLayout &DL = I->getModule()->getDataLayout();
uint64_t ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
uint64_t VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if ((!ConsecutiveStride && !UseGatherOrScatter) \|\|
ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =		bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);		isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;		unsigned Cost = 0;
// The cost of extracting from the value vector and pointer vector.		// The cost of extracting from the value vector and pointer vector.
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);		Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
for (unsigned i = 0; i < VF; ++i) {		for (unsigned i = 0; i < VF; ++i) {
// The cost of extracting the pointer operand.		// The cost of extracting the pointer operand.
Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);		Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
// In case of STORE, the cost of ExtractElement from the vector.		// In case of STORE, the cost of ExtractElement from the vector.
// In case of LOAD, the cost of InsertElement into the returned		// In case of LOAD, the cost of InsertElement into the returned
// vector.		// vector.
Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement		Cost += TTI.getVectorInstrCost(SI ? Instruction::ExtractElement
: Instruction::InsertElement,		: Instruction::InsertElement,
VectorTy, i);		VectorTy, i);
}		}

// The cost of the scalar loads/stores.		// The cost of the scalar loads/stores.
Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);		Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
Cost += VF *		Cost += VF *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),		TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
Alignment, AS);		Alignment, AS);
return Cost;		return Cost;
}		}

		// Determine if the pointer operand of the access is either consecutive or
		// reverse consecutive.
		int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
		bool Reverse = ConsecutiveStride < 0;

		// Determine if either a gather or scatter operation is legal.
		bool UseGatherOrScatter =
		!ConsecutiveStride && Legal->isLegalGatherOrScatter(I);

unsigned Cost = TTI.getAddressComputationCost(VectorTy);		unsigned Cost = TTI.getAddressComputationCost(VectorTy);
if (UseGatherOrScatter) {		if (UseGatherOrScatter) {
assert(ConsecutiveStride == 0 &&		assert(ConsecutiveStride == 0 &&
"Gather/Scatter are not used for consecutive stride");		"Gather/Scatter are not used for consecutive stride");
return Cost +		return Cost +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,		TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
Legal->isMaskRequired(I), Alignment);		Legal->isMaskRequired(I), Alignment);
}		}
▲ Show 20 Lines • Show All 553 Lines • Show Last 20 Lines

llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

				; REQUIRES: asserts
				; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 \| FileCheck %s
				; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-cond-stores-vec -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 \| FileCheck %s --check-prefix=INTER

				target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

				%pair = type { i32, i32 }

				; CHECK-LABEL: consecutive_ptr_forward
				;
				; Check that a forward consecutive pointer is recognized as uniform and remains
				; uniform after vectorization.
				;
				; CHECK: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
				; CHECK: vector.body
				; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECK-NOT: getelementptr
				; CHECK: getelementptr inbounds i32, i32* %a, i64 %index
				; CHECK-NOT: getelementptr
				; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				define i32 @consecutive_ptr_forward(i32* %a, i64 %n) {
				entry:
				br label %for.body

				for.body:
				%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
				%tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
				%tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
				%tmp2 = load i32, i32* %tmp1, align 8
				%tmp3 = add i32 %tmp0, %tmp2
				%i.next = add nuw nsw i64 %i, 1
				%cond = icmp slt i64 %i.next, %n
				br i1 %cond, label %for.body, label %for.end

				for.end:
				%tmp4 = phi i32 [ %tmp3, %for.body ]
				ret i32 %tmp4
				}

				; CHECK-LABEL: consecutive_ptr_reverse
				;
				; Check that a reverse consecutive pointer is recognized as uniform and remains
				; uniform after vectorization.
				;
				; CHECK: LV: Found uniform instruction: %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
				; CHECK: vector.body
				; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECK: %offset.idx = sub i64 %n, %index
				; CHECK-NOT: getelementptr
				; CHECK: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %offset.idx
				; CHECK: getelementptr i32, i32* %[[G0]], i64 -3
				; CHECK-NOT: getelementptr
				; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				define i32 @consecutive_ptr_reverse(i32* %a, i64 %n) {
				entry:
				br label %for.body

				for.body:
				%i = phi i64 [ %i.next, %for.body ], [ %n, %entry ]
				%tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
				%tmp1 = getelementptr inbounds i32, i32* %a, i64 %i
				%tmp2 = load i32, i32* %tmp1, align 8
				%tmp3 = add i32 %tmp0, %tmp2
				%i.next = add nuw nsw i64 %i, -1
				%cond = icmp sgt i64 %i.next, 0
				br i1 %cond, label %for.body, label %for.end

				for.end:
				%tmp4 = phi i32 [ %tmp3, %for.body ]
				ret i32 %tmp4
				}
				; CHECK-LABEL: interleaved_access_forward
				;
				; Check that a consecutive-like pointer used by a forward interleaved group is
				; recognized as uniform and remains uniform after vectorization. When
				; interleaved memory accesses aren't enabled, the pointer should not be
				; recognized as uniform, and it should not be uniform after vectorization.
				;
				; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
				; CHECK: vector.body
				; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECK: %[[I1:.+]] = or i64 %index, 1
				; CHECK: %[[I2:.+]] = or i64 %index, 2
				; CHECK: %[[I3:.+]] = or i64 %index, 3
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %index, i32 1
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 1
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 1
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 1
				; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				; INTER: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				; INTER: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
				; INTER: vector.body
				; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; INTER-NOT: getelementptr
				; INTER: getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
				; INTER-NOT: getelementptr
				; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				define i32 @interleaved_access_forward(%pair* %p, i64 %n) {
				entry:
				br label %for.body

				for.body:
				%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
				%tmp0 = phi i32 [ %tmp6, %for.body ], [ 0, %entry ]
				%tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				%tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
				%tmp3 = load i32, i32* %tmp1, align 8
				%tmp4 = load i32, i32* %tmp2, align 8
				%tmp5 = add i32 %tmp3, %tmp4
				%tmp6 = add i32 %tmp0, %tmp5
				%i.next = add nuw nsw i64 %i, 1
				%cond = icmp slt i64 %i.next, %n
				br i1 %cond, label %for.body, label %for.end

				for.end:
				%tmp14 = phi i32 [ %tmp6, %for.body ]
				ret i32 %tmp14
				}

				; CHECK-LABEL: interleaved_access_reverse
				;
				; Check that a consecutive-like pointer used by a reverse interleaved group is
				; recognized as uniform and remains uniform after vectorization. When
				; interleaved memory accesses aren't enabled, the pointer should not be
				; recognized as uniform, and it should not be uniform after vectorization.
				;
				; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				; CHECK-NOT: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
				; CHECK: vector.body
				; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECK: %offset.idx = sub i64 %n, %index
				; CHECK: %[[I1:.+]] = add i64 %offset.idx, -1
				; CHECK: %[[I2:.+]] = add i64 %offset.idx, -2
				; CHECK: %[[I3:.+]] = add i64 %offset.idx, -3
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 1
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 1
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 1
				; CHECK: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 1
				; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				; INTER: LV: Found uniform instruction: %tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				; INTER: LV: Found uniform instruction: %tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
				; INTER: vector.body
				; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; INTER: %offset.idx = sub i64 %n, %index
				; INTER-NOT: getelementptr
				; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
				; INTER: getelementptr i32, i32* %[[G0]], i64 -6
				; INTER-NOT: getelementptr
				; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				define i32 @interleaved_access_reverse(%pair* %p, i64 %n) {
				entry:
				br label %for.body

				for.body:
				%i = phi i64 [ %i.next, %for.body ], [ %n, %entry ]
				%tmp0 = phi i32 [ %tmp6, %for.body ], [ 0, %entry ]
				%tmp1 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				%tmp2 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 1
				%tmp3 = load i32, i32* %tmp1, align 8
				%tmp4 = load i32, i32* %tmp2, align 8
				%tmp5 = add i32 %tmp3, %tmp4
				%tmp6 = add i32 %tmp0, %tmp5
				%i.next = add nuw nsw i64 %i, -1
				%cond = icmp sgt i64 %i.next, 0
				br i1 %cond, label %for.body, label %for.end

				for.end:
				%tmp14 = phi i32 [ %tmp6, %for.body ]
				ret i32 %tmp14
				}

				; CHECK-LABEL: predicated_store
				;
				; Check that a consecutive-like pointer used by a forward interleaved group and
				; scalarized store is not recognized as uniform and is not uniform after
				; vectorization. The store is scalarized because it's in a predicated block.
				; Even though the load in this example is vectorized and only uses the pointer
				; as if it were uniform, the store is scalarized, making the pointer
				; non-uniform.
				;
				; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				; INTER: vector.body
				; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ]
				; INTER: %[[I1:.+]] = or i64 %index, 1
				; INTER: %[[I2:.+]] = or i64 %index, 2
				; INTER: %[[I3:.+]] = or i64 %index, 3
				; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0
				; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0
				; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0
				; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0
				; INTER: %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>*
				; INTER: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8
				; INTER: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				define void @predicated_store(%pair *%p, i32 %x, i64 %n) {
				entry:
				br label %for.body

				for.body:
				%i = phi i64 [ %i.next, %if.merge ], [ 0, %entry ]
				%tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0
				%tmp1 = load i32, i32* %tmp0, align 8
				%tmp2 = icmp eq i32 %tmp1, %x
				br i1 %tmp2, label %if.then, label %if.merge

				if.then:
				store i32 %tmp1, i32* %tmp0, align 8
				br label %if.merge

				if.merge:
				%i.next = add nuw nsw i64 %i, 1
				%cond = icmp slt i64 %i.next, %n
				br i1 %cond, label %for.body, label %for.end

				for.end:
				ret void
				}

				; CHECK-LABEL: irregular_type
				;
				; Check that a consecutive pointer used by a scalarized store is not recognized
				; as uniform and is not uniform after vectorization. The store is scalarized
				; because the stored type may require padding.
				;
				; CHECK-NOT: LV: Found uniform instruction: %tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
				; CHECK: vector.body
				; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
				; CHECK: %[[I1:.+]] = or i64 %index, 1
				; CHECK: %[[I2:.+]] = or i64 %index, 2
				; CHECK: %[[I3:.+]] = or i64 %index, 3
				; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %index
				; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I1]]
				; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I2]]
				; CHECK: getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %[[I3]]
				; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
				;
				define void @irregular_type(x86_fp80* %a, i64 %n) {
				entry:
				br label %for.body

				for.body:
				%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
				%tmp0 = sitofp i32 1 to x86_fp80
				%tmp1 = getelementptr inbounds x86_fp80, x86_fp80* %a, i64 %i
				store x86_fp80 %tmp0, x86_fp80* %tmp1, align 16
				%i.next = add i64 %i, 1
				%cond = icmp slt i64 %i.next, %n
				br i1 %cond, label %for.body, label %for.end

				for.end:
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[LV] Don't mark pointers used by scalarized memory accesses uniform
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 70737

llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LV] Don't mark pointers used by scalarized memory accesses uniformClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 70737

llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/trunk/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

[LV] Don't mark pointers used by scalarized memory accesses uniform
ClosedPublic