Diff 278737

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,024 Lines • ▼ Show 20 Lines	public:
void setCostBasedWideningDecision(unsigned VF);		void setCostBasedWideningDecision(unsigned VF);

/// A struct that represents some properties of the register usage		/// A struct that represents some properties of the register usage
/// of a loop.		/// of a loop.
struct RegisterUsage {		struct RegisterUsage {
/// Holds the number of loop invariant values that are used in the loop.		/// Holds the number of loop invariant values that are used in the loop.
/// The key is ClassID of target-provided register class.		/// The key is ClassID of target-provided register class.
SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;		SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
/// Holds the maximum number of concurrent live intervals in the loop.		/// Holds the maximum number of concurrent live intervals in the loop.
		dmgreenUnsubmitted Not Done Reply Inline Actions Latch might be a better name here. dmgreen: Latch might be a better name here.
/// The key is ClassID of target-provided register class.		/// The key is ClassID of target-provided register class.
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;		SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
};		};

/// \return Returns information about the register usages of the loop for the		/// \return Returns information about the register usages of the loop for the
/// given vectorization factors.		/// given vectorization factors.
SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);		SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);

▲ Show 20 Lines • Show All 3,142 Lines • ▼ Show 20 Lines	if (EnableVPlanNativePath) {

return;		return;
}		}

assert(PN->getParent() == OrigLoop->getHeader() &&		assert(PN->getParent() == OrigLoop->getHeader() &&
"Non-header phis should have been handled elsewhere");		"Non-header phis should have been handled elsewhere");

// In order to support recurrences we need to be able to vectorize Phi nodes.		// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is		// Phi nodes have cycles, so we need to vectorize them in two stages. This is
		SjoerdMeijerUnsubmitted Done Reply Inline Actions Ah, look at this....I was actually wondering about this last week, and now I found it :) SjoerdMeijer: Ah, look at this....I was actually wondering about this last week, and now I found it :)
// stage #1: We create a new vector PHI node with no incoming edges. We'll use		// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.		// this value when we vectorize all of the instructions that use the PHI.
if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {		if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
for (unsigned Part = 0; Part < UF; ++Part) {		for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.		// This is phase one of vectorizing PHIs.
Type *VecTy =		Type *VecTy =
(VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);		(VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *EntryPart = PHINode::Create(		Value *EntryPart = PHINode::Create(
Show All 13 Lines	void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();		const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

// FIXME: The newly created binary instructions should contain nsw/nuw flags,		// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.		// which can be found from the original scalar operations.
switch (II.getKind()) {		switch (II.getKind()) {
case InductionDescriptor::IK_NoInduction:		case InductionDescriptor::IK_NoInduction:
llvm_unreachable("Unknown induction");		llvm_unreachable("Unknown induction");
case InductionDescriptor::IK_IntInduction:		case InductionDescriptor::IK_IntInduction:
case InductionDescriptor::IK_FpInduction:		case InductionDescriptor::IK_FpInduction:
SjoerdMeijerUnsubmitted Done Reply Inline Actions nit: unnecessary change? SjoerdMeijer: nit: unnecessary change?
anwelAuthorUnsubmitted Done Reply Inline Actions Yes, but also unnecessary return in the first place, and a change very close to the relevant changes? anwel: Yes, but also unnecessary return in the first place, and a change very close to the relevant…
		SjoerdMeijerUnsubmitted Done Reply Inline Actions Nit: can we just return here, and get rid of the "else"? That saves some indentation. SjoerdMeijer: Nit: can we just return here, and get rid of the "else"? That saves some indentation.
		anwelAuthorUnsubmitted Done Reply Inline Actions Yes, we can. anwel: Yes, we can.
llvm_unreachable("Integer/fp induction is handled elsewhere.");		llvm_unreachable("Integer/fp induction is handled elsewhere.");
		SjoerdMeijerUnsubmitted Done Reply Inline Actions Nit: I don't know if the step is always a constant for a PtrInduction, so don't know if this should an assert or just a return. SjoerdMeijer: Nit: I don't know if the step is always a constant for a PtrInduction, so don't know if this…
		anwelAuthorUnsubmitted Done Reply Inline Actions Using the assert here was inspired by emitTransformedIndex, which was - and, if Cost->isScalarAfterVectorization(P, VF) is true, still is - used for scalarising the induction. It uses the exact same assert for the step of a pointer induction. anwel: Using the assert here was inspired by emitTransformedIndex, which was - and, if Cost…
case InductionDescriptor::IK_PtrInduction: {		case InductionDescriptor::IK_PtrInduction: {
// Handle the pointer induction variable case.		// Handle the pointer induction variable case.
assert(P->getType()->isPointerTy() && "Unexpected type.");		assert(P->getType()->isPointerTy() && "Unexpected type.");

		if (Cost->isScalarAfterVectorization(P, VF)) {
// This is the normalized GEP that starts counting at zero.		// This is the normalized GEP that starts counting at zero.
Value *PtrInd = Induction;		Value *PtrInd =
		SjoerdMeijerUnsubmitted Done Reply Inline Actions Nit: NewOldPhi is a bit of a confusing name. SjoerdMeijer: Nit: NewOldPhi is a bit of a confusing name.
		anwelAuthorUnsubmitted Done Reply Inline Actions Okay, maybe a bit confusing. I renamed it to NewPointerPhi, that's a bit more straightforward. anwel: Okay, maybe a bit confusing. I renamed it to NewPointerPhi, that's a bit more straightforward.
PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());		Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll		// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the		// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.		// first lane. Otherwise, we generate all VF values.
unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;		unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
// These are the scalar results. Notice that we don't generate vector GEPs
// because scalar GEPs result in better code.
for (unsigned Part = 0; Part < UF; ++Part) {		for (unsigned Part = 0; Part < UF; ++Part) {
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {		for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);		Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);		Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
Value *SclrGep =		Value *SclrGep =
emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);		emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");		SclrGep->setName("next.gep");
VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);		VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}		}
}		}
return;		return;
		SjoerdMeijerUnsubmitted Done Reply Inline Actions nit: this comment could be more descriptive. SjoerdMeijer: nit: this comment could be more descriptive.
		anwelAuthorUnsubmitted Done Reply Inline Actions Comment has been made more descriptive :) anwel: Comment has been made more descriptive :)
}		}
		assert(isa<SCEVConstant>(II.getStep()) &&
		"Induction step not a SCEV constant!");
		Type *PhiType = II.getStep()->getType();

		// Build a pointer phi
		Value *ScalarStartValue = II.getStartValue();
		Type *ScStValueType = ScalarStartValue->getType();
		PHINode *NewPointerPhi =
		PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
		NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

		// A pointer induction, performed by using a gep
		BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
		Instruction *InductionLoc = LoopLatch->getTerminator();
		const SCEV *ScalarStep = II.getStep();
		SCEVExpander Exp(*PSE.getSE(), DL, "induction");
		Value *ScalarStepValue =
		Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
		Value *InductionGEP = GetElementPtrInst::Create(
		ScStValueType->getPointerElementType(), NewPointerPhi,
		Builder.CreateMul(ScalarStepValue, ConstantInt::get(PhiType, VF * UF)),
		"ptr.ind", InductionLoc);
		NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

		// Create UF many actual address geps that use the pointer
		// phi as base and a vectorized version of the step value
		// (<step0, ..., stepN>) as offset.
		for (unsigned Part = 0; Part < UF; ++Part) {
		SmallVector<Constant *, 8> Indices;
		// Create a vector of consecutive numbers from zero to VF.
		for (unsigned i = 0; i < VF; ++i)
		Indices.push_back(ConstantInt::get(PhiType, i + Part * VF));
		Constant *StartOffset = ConstantVector::get(Indices);

		Value *GEP = Builder.CreateGEP(
		ScStValueType->getPointerElementType(), NewPointerPhi,
		Builder.CreateMul(StartOffset,
		Builder.CreateVectorSplat(VF, ScalarStepValue),
		"vector.gep"));
		VectorLoopValueMap.setVectorValue(P, Part, GEP);
		}
		}
}		}
}		}

/// A helper function for checking whether an integer division-related		/// A helper function for checking whether an integer division-related
/// instruction may divide by zero (in which case it must be predicated if		/// instruction may divide by zero (in which case it must be predicated if
/// executed conditionally in the scalar code).		/// executed conditionally in the scalar code).
/// TODO: It may be worthwhile to generalize and check isKnownNonZero().		/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non compile-time constants will not be		/// Non-zero divisors that are non compile-time constants will not be
▲ Show 20 Lines • Show All 218 Lines • ▼ Show 20 Lines	assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
"This function should not be visited twice for the same VF");		"This function should not be visited twice for the same VF");

SmallSetVector<Instruction *, 8> Worklist;		SmallSetVector<Instruction *, 8> Worklist;

// These sets are used to seed the analysis with pointers used by memory		// These sets are used to seed the analysis with pointers used by memory
// accesses that will remain scalar.		// accesses that will remain scalar.
SmallSetVector<Instruction *, 8> ScalarPtrs;		SmallSetVector<Instruction *, 8> ScalarPtrs;
SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;		SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
		auto *Latch = TheLoop->getLoopLatch();

// A helper that returns true if the use of Ptr by MemAccess will be scalar.		// A helper that returns true if the use of Ptr by MemAccess will be scalar.
// The pointer operands of loads and stores will be scalar as long as the		// The pointer operands of loads and stores will be scalar as long as the
// memory access is not a gather or scatter operation. The value operand of a		// memory access is not a gather or scatter operation. The value operand of a
// store will remain scalar if the store is scalarized.		// store will remain scalar if the store is scalarized.
auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {		auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
InstWidening WideningDecision = getWideningDecision(MemAccess, VF);		InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
assert(WideningDecision != CM_Unknown &&		assert(WideningDecision != CM_Unknown &&
"Widening decision should be ready at this moment");		"Widening decision should be ready at this moment");
if (auto *Store = dyn_cast<StoreInst>(MemAccess))		if (auto *Store = dyn_cast<StoreInst>(MemAccess))
if (Ptr == Store->getValueOperand())		if (Ptr == Store->getValueOperand())
		SjoerdMeijerUnsubmitted Not Done Reply Inline Actions Nit: this comment is slightly out of date, i.e. the `ScalarPtrs` part.... SjoerdMeijer: Nit: this comment is slightly out of date, i.e. the `ScalarPtrs` part....
return WideningDecision == CM_Scalarize;		return WideningDecision == CM_Scalarize;
assert(Ptr == getLoadStorePointerOperand(MemAccess) &&		assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
"Ptr is neither a value or pointer operand");		"Ptr is neither a value or pointer operand");
return WideningDecision != CM_GatherScatter;		return WideningDecision != CM_GatherScatter;
		SjoerdMeijerUnsubmitted Not Done Reply Inline Actions .... because we are adding things to `Worklist` here. I was getting confused a bit if this should not be `ScalarsPtrs`, but because it is an induction it makes sense to add it to Worklist? SjoerdMeijer: .... because we are adding things to `Worklist` here. I was getting confused a bit if this…
};		};

// A helper that returns true if the given value is a bitcast or		// A helper that returns true if the given value is a bitcast or
// getelementptr instruction contained in the loop.		// getelementptr instruction contained in the loop.
auto isLoopVaryingBitCastOrGEP = [&](Value *V) {		auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||		return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
isa<GetElementPtrInst>(V)) &&		isa<GetElementPtrInst>(V)) &&
!TheLoop->isLoopInvariant(V);		!TheLoop->isLoopInvariant(V);
};		};

// A helper that evaluates a memory access's use of a pointer. If the use		auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
// will be a scalar use, and the pointer is only used by memory accesses, we		if (!isa<PHINode>(Ptr) ||
// place the pointer in ScalarPtrs. Otherwise, the pointer is placed in		!Legal->getInductionVars().count(cast<PHINode>(Ptr)))
// PossibleNonScalarPtrs.		return false;
		auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
		if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
		return false;
		return isScalarUse(MemAccess, Ptr);
		};

		// A helper that evaluates a memory access's use of a pointer. If the
		// pointer is actually the pointer induction of a loop, it is being
		// inserted into Worklist. If the use will be a scalar use, and the
		// pointer is only used by memory accesses, we place the pointer in
		// ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {		auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
		if (isScalarPtrInduction(MemAccess, Ptr)) {
		Worklist.insert(cast<Instruction>(Ptr));
		Instruction *Update = cast<Instruction>(
		cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
		Worklist.insert(Update);
		LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
		<< "\n");
		LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
		<< "\n");
		SjoerdMeijerUnsubmitted Done Reply Inline Actions Nit: missing `.` SjoerdMeijer: Nit: missing `.`
		return;
		}
// We only care about bitcast and getelementptr instructions contained in		// We only care about bitcast and getelementptr instructions contained in
// the loop.		// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))		if (!isLoopVaryingBitCastOrGEP(Ptr))
return;		return;

// If the pointer has already been identified as scalar (e.g., if it was		// If the pointer has already been identified as scalar (e.g., if it was
// also identified as uniform), there's nothing to do.		// also identified as uniform), there's nothing to do.
auto *I = cast<Instruction>(Ptr);		auto *I = cast<Instruction>(Ptr);
if (Worklist.count(I))		if (Worklist.count(I))
return;		return;

// If the use of the pointer will be a scalar use, and all users of the		// If the use of the pointer will be a scalar use, and all users of the
// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,		// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
// place the pointer in PossibleNonScalarPtrs.		// place the pointer in PossibleNonScalarPtrs.
if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {		if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
return isa<LoadInst>(U) || isa<StoreInst>(U);		return isa<LoadInst>(U) || isa<StoreInst>(U);
}))		}))
ScalarPtrs.insert(I);		ScalarPtrs.insert(I);
else		else
PossibleNonScalarPtrs.insert(I);		PossibleNonScalarPtrs.insert(I);
};		};

// We seed the scalars analysis with three classes of instructions: (1)		// We seed the scalars analysis with three classes of instructions: (1)
// instructions marked uniform-after-vectorization, (2) bitcast and		// instructions marked uniform-after-vectorization and (2) bitcast,
// getelementptr instructions used by memory accesses requiring a scalar use,		// getelementptr and (pointer) phi instructions used by memory accesses
// and (3) pointer induction variables and their update instructions (we		// requiring a scalar use.
// currently only scalarize these).
//		//
// (1) Add to the worklist all instructions that have been identified as		// (1) Add to the worklist all instructions that have been identified as
// uniform-after-vectorization.		// uniform-after-vectorization.
Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());		Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

// (2) Add to the worklist all bitcast and getelementptr instructions used by		// (2) Add to the worklist all bitcast and getelementptr instructions used by
// memory accesses requiring a scalar use. The pointer operands of loads and		// memory accesses requiring a scalar use. The pointer operands of loads and
// stores will be scalar as long as the memory accesses is not a gather or		// stores will be scalar as long as the memory accesses is not a gather or
Show All 9 Lines	for (auto &I : *BB) {
}		}
}		}
for (auto *I : ScalarPtrs)		for (auto *I : ScalarPtrs)
if (!PossibleNonScalarPtrs.count(I)) {		if (!PossibleNonScalarPtrs.count(I)) {
LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");		LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);		Worklist.insert(I);
}		}

// (3) Add to the worklist all pointer induction variables and their update
// instructions.
//
// TODO: Once we are able to vectorize pointer induction variables we should
// no longer insert them into the worklist here.
auto *Latch = TheLoop->getLoopLatch();
for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
continue;
Worklist.insert(Ind);
Worklist.insert(IndUpdate);
LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
<< "\n");
}

// Insert the forced scalars.		// Insert the forced scalars.
// FIXME: Currently widenPHIInstruction() often creates a dead vector		// FIXME: Currently widenPHIInstruction() often creates a dead vector
// induction variable when the PHI user is scalarized.		// induction variable when the PHI user is scalarized.
auto ForcedScalar = ForcedScalars.find(VF);		auto ForcedScalar = ForcedScalars.find(VF);
if (ForcedScalar != ForcedScalars.end())		if (ForcedScalar != ForcedScalars.end())
for (auto *I : ForcedScalar->second)		for (auto *I : ForcedScalar->second)
Worklist.insert(I);		Worklist.insert(I);

Show All 19 Lines	void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}		}

// An induction variable will remain scalar if all users of the induction		// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.		// variable and induction variable update remain scalar.
for (auto &Induction : Legal->getInductionVars()) {		for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;		auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));		auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

// We already considered pointer induction variables, so there's no reason
// to look at their users again.
//
// TODO: Once we are able to vectorize pointer induction variables we
// should no longer skip over them here.
if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
continue;

// If tail-folding is applied, the primary induction variable will be used		// If tail-folding is applied, the primary induction variable will be used
// to feed a vector compare.		// to feed a vector compare.
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())		if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;		continue;

// Determine if all users of the induction variable are scalar after		// Determine if all users of the induction variable are scalar after
// vectorization.		// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {		auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
▲ Show 20 Lines • Show All 3,497 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -loop-vectorize -S -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -dce -instcombine --simplifycfg -enable-arm-maskedgatscat < %s | FileCheck %s

				target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
				target triple = "thumbv8.1m.main-none-none-eabi"

				define hidden void @pointer_phi_v4i32_add1(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %s, i32%y) {
				; CHECK-LABEL: @pointer_phi_v4i32_add1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
				; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
				; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi i32* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load i32, i32* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds i32, i32* %A.addr.09, i32 1
				%add = add nsw i32 %0, %y
				store i32 %add, i32* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds i32, i32* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				define hidden void @pointer_phi_v4i32_add2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v4i32_add2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 1992
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 996
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 1
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[A]], i32 [[TMP0]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <8 x i32>*
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[NEXT_GEP4]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
				; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !2
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR_09]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_09]], i32 2
				; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[Y]]
				; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_07]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_07]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !3
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi i32* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load i32, i32* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds i32, i32* %A.addr.09, i32 2
				%add = add nsw i32 %0, %y
				store i32 %add, i32* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds i32, i32* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				define hidden void @pointer_phi_v4i32_add3(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v4i32_add3(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 2988
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 996
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
				; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 12
				; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[A_ADDR_09]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_09]], i32 3
				; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]]
				; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_07]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_07]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !6
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi i32* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load i32, i32* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds i32, i32* %A.addr.09, i32 3
				%add = add nsw i32 %0, %y
				store i32 %add, i32* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds i32, i32* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				define hidden void @pointer_phi_v8i16_add1(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v8i16_add1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP0]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.]] = phi i32 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[NEXT_GEP:%.]] = getelementptr i16, i16 [[A:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[NEXT_GEP4:%.]] = getelementptr i16, i16 [[B:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP1:%.]] = bitcast i16 [[NEXT_GEP]] to <8 x i16>*
				; CHECK-NEXT: [[WIDE_LOAD:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 2
				; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP3:%.]] = bitcast i16 [[NEXT_GEP4]] to <8 x i16>*
				; CHECK-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* [[TMP3]], align 2
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
				; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
				; CHECK-NEXT: br i1 [[TMP4]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				%0 = trunc i32 %y to i16
				br label %for.body
				for.body: ; preds = %for.body, %for.body.lr.ph
				%A.addr.011 = phi i16* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.010 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.09 = phi i16* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%l1 = load i16, i16* %A.addr.011, align 2
				%add.ptr = getelementptr inbounds i16, i16* %A.addr.011, i32 1
				%conv1 = add i16 %l1, %0
				store i16 %conv1, i16* %B.addr.09, align 2
				%incdec.ptr = getelementptr inbounds i16, i16* %B.addr.09, i32 1
				%inc = add nuw nsw i32 %i.010, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; i16 elements, source pointer advances by 2 elements per iteration. The
				; checks expect a <16 x i16> wide load whose even lanes are extracted with a
				; shufflevector, a vector loop covering the first 992 iterations, and the
				; original scalar loop kept as the remainder up to 1000.
				define hidden void @pointer_phi_v8i16_add2(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v8i16_add2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1984
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 992
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP0]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[A]], i32 [[TMP1]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP]] to <16 x i16>*
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i16>, <16 x i16>* [[TMP2]], align 2
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i16> [[WIDE_VEC]], <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16* [[NEXT_GEP4]] to <8 x i16>*
				; CHECK-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* [[TMP4]], align 2
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
				; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
				; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_011:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_09:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[L1:%.*]] = load i16, i16* [[A_ADDR_011]], align 2
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[A_ADDR_011]], i32 2
				; CHECK-NEXT: [[CONV1:%.*]] = add i16 [[L1]], [[TMP0]]
				; CHECK-NEXT: store i16 [[CONV1]], i16* [[B_ADDR_09]], align 2
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[B_ADDR_09]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !9
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				%0 = trunc i32 %y to i16
				br label %for.body
				for.body: ; preds = %for.body, %entry
				%A.addr.011 = phi i16* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.010 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.09 = phi i16* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%l1 = load i16, i16* %A.addr.011, align 2
				%add.ptr = getelementptr inbounds i16, i16* %A.addr.011, i32 2
				%conv1 = add i16 %l1, %0
				store i16 %conv1, i16* %B.addr.09, align 2
				%incdec.ptr = getelementptr inbounds i16, i16* %B.addr.09, i32 1
				%inc = add nuw nsw i32 %i.010, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; i16 elements, source pointer advances by 3 elements per iteration. The
				; checks expect the loop to be left scalar: no vector.body block appears and
				; the for.body instructions match the input one-for-one.
				define hidden void @pointer_phi_v8i16_add3(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v8i16_add3(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_011:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_09:%.*]] = phi i16* [ [[B:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[L1:%.*]] = load i16, i16* [[A_ADDR_011]], align 2
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[A_ADDR_011]], i32 3
				; CHECK-NEXT: [[CONV1:%.*]] = add i16 [[L1]], [[TMP0]]
				; CHECK-NEXT: store i16 [[CONV1]], i16* [[B_ADDR_09]], align 2
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[B_ADDR_09]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]]
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				%0 = trunc i32 %y to i16
				br label %for.body
				for.body: ; preds = %for.body, %entry
				%A.addr.011 = phi i16* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.010 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.09 = phi i16* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%l1 = load i16, i16* %A.addr.011, align 2
				%add.ptr = getelementptr inbounds i16, i16* %A.addr.011, i32 3
				%conv1 = add i16 %l1, %0
				store i16 %conv1, i16* %B.addr.09, align 2
				%incdec.ptr = getelementptr inbounds i16, i16* %B.addr.09, i32 1
				%inc = add nuw nsw i32 %i.010, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; i8 elements, source pointer advances by 1 element per iteration. The checks
				; expect a contiguous <16 x i8> load/store in a vector loop covering the first
				; 992 iterations, with the original scalar loop kept as the remainder up to
				; 1000.
				define hidden void @pointer_phi_v16i8_add1(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v16i8_add1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i8
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 992
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 992
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[A]], i32 [[INDEX]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[NEXT_GEP]] to <16 x i8>*
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1
				; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[NEXT_GEP4]] to <16 x i8>*
				; CHECK-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* [[TMP3]], align 1
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
				; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
				; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_010:%.*]] = phi i8* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP5:%.*]] = load i8, i8* [[A_ADDR_010]], align 1
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_010]], i32 1
				; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP5]], [[TMP0]]
				; CHECK-NEXT: store i8 [[CONV1]], i8* [[B_ADDR_08]], align 1
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !11
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				%0 = trunc i32 %y to i8
				br label %for.body

				for.body:
				%A.addr.010 = phi i8* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.08 = phi i8* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%1 = load i8, i8* %A.addr.010, align 1
				%add.ptr = getelementptr inbounds i8, i8* %A.addr.010, i32 1
				%conv1 = add i8 %1, %0
				store i8 %conv1, i8* %B.addr.08, align 1
				%incdec.ptr = getelementptr inbounds i8, i8* %B.addr.08, i32 1
				%inc = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; i8 elements, source pointer advances by 2 elements per iteration. The checks
				; expect a <32 x i8> wide load whose even lanes are extracted with a
				; shufflevector, a vector loop covering the first 992 iterations, and the
				; original scalar loop kept as the remainder up to 1000.
				define hidden void @pointer_phi_v16i8_add2(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v16i8_add2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i8
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[A:%.*]], i32 1984
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[B:%.*]], i32 992
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> undef, <16 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[INDEX]], 1
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[A]], i32 [[TMP1]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[NEXT_GEP]] to <32 x i8>*
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
				; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[NEXT_GEP4]] to <16 x i8>*
				; CHECK-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* [[TMP4]], align 1
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
				; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
				; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_010:%.*]] = phi i8* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[A_ADDR_010]], align 1
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_010]], i32 2
				; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP6]], [[TMP0]]
				; CHECK-NEXT: store i8 [[CONV1]], i8* [[B_ADDR_08]], align 1
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !13
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				%0 = trunc i32 %y to i8
				br label %for.body

				for.body:
				%A.addr.010 = phi i8* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.08 = phi i8* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%1 = load i8, i8* %A.addr.010, align 1
				%add.ptr = getelementptr inbounds i8, i8* %A.addr.010, i32 2
				%conv1 = add i8 %1, %0
				store i8 %conv1, i8* %B.addr.08, align 1
				%incdec.ptr = getelementptr inbounds i8, i8* %B.addr.08, i32 1
				%inc = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; i8 elements, source pointer advances by 3 elements per iteration. The checks
				; expect the loop to be left scalar: no vector.body block appears and the
				; for.body instructions match the input one-for-one.
				define hidden void @pointer_phi_v16i8_add3(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v16i8_add3(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i8
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_010:%.*]] = phi i8* [ [[A:%.*]], [[ENTRY:%.*]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_08:%.*]] = phi i8* [ [[B:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[A_ADDR_010]], align 1
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[A_ADDR_010]], i32 3
				; CHECK-NEXT: [[CONV1:%.*]] = add i8 [[TMP1]], [[TMP0]]
				; CHECK-NEXT: store i8 [[CONV1]], i8* [[B_ADDR_08]], align 1
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[B_ADDR_08]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]]
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				%0 = trunc i32 %y to i8
				br label %for.body

				for.body:
				%A.addr.010 = phi i8* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.08 = phi i8* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%1 = load i8, i8* %A.addr.010, align 1
				%add.ptr = getelementptr inbounds i8, i8* %A.addr.010, i32 3
				%conv1 = add i8 %1, %0
				store i8 %conv1, i8* %B.addr.08, align 1
				%incdec.ptr = getelementptr inbounds i8, i8* %B.addr.08, i32 1
				%inc = add nuw nsw i32 %i.09, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; float elements, source pointer advances by 1 element per iteration, 1000
				; iterations. The checks expect both pointer phis to be rewritten as GEPs off
				; the vector index, a contiguous <4 x float> load/store, and the vector loop
				; branching straight to end (no scalar remainder loop).
				define hidden void @pointer_phi_v4f32_add1(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
				; CHECK-LABEL: @pointer_phi_v4f32_add1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[A:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr float, float* [[B:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[NEXT_GEP]] to <4 x float>*
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
				; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[NEXT_GEP4]] to <4 x float>*
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
				; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi float* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi float* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load float, float* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds float, float* %A.addr.09, i32 1
				%add = fadd fast float %0, %y
				store float %add, float* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds float, float* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; float elements, source pointer advances by 2 elements per iteration. The
				; checks expect an <8 x float> wide load whose even lanes are extracted with a
				; shufflevector, a vector loop covering the first 996 iterations, and the
				; original scalar loop kept as the remainder up to 1000.
				define hidden void @pointer_phi_v4f32_add2(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
				; CHECK-LABEL: @pointer_phi_v4f32_add2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[A:%.*]], i32 1992
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr float, float* [[B:%.*]], i32 996
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 1
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[A]], i32 [[TMP0]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr float, float* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[NEXT_GEP]] to <8 x float>*
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
				; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[NEXT_GEP4]] to <4 x float>*
				; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
				; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !15
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[A_ADDR_09]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[A_ADDR_09]], i32 2
				; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP5]], [[Y]]
				; CHECK-NEXT: store float [[ADD]], float* [[B_ADDR_07]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[B_ADDR_07]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !16
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi float* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi float* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load float, float* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds float, float* %A.addr.09, i32 2
				%add = fadd fast float %0, %y
				store float %add, float* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds float, float* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; float elements, source pointer advances by 3 elements per iteration. The
				; checks expect the source pointer to remain a pointer phi in the vector loop
				; (advanced by 12 elements per vector iteration), per-lane addresses formed by
				; a vector GEP with offsets <0, 3, 6, 9>, and the load done with
				; llvm.masked.gather. The vector loop covers the first 996 iterations and the
				; original scalar loop is kept as the remainder up to 1000.
				define hidden void @pointer_phi_v4f32_add3(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
				; CHECK-LABEL: @pointer_phi_v4f32_add3(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr float, float* [[A:%.*]], i32 2988
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr float, float* [[B:%.*]], i32 996
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi float* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, float* [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
				; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[NEXT_GEP]] to <4 x float>*
				; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 996
				; CHECK-NEXT: [[PTR_IND]] = getelementptr float, float* [[POINTER_PHI]], i32 12
				; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !17
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[A_ADDR_09]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[A_ADDR_09]], i32 3
				; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]]
				; CHECK-NEXT: store float [[ADD]], float* [[B_ADDR_07]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[B_ADDR_07]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !18
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi float* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi float* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load float, float* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds float, float* %A.addr.09, i32 3
				%add = fadd fast float %0, %y
				store float %add, float* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds float, float* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; half elements, source pointer advances by 1 element per iteration, 1000
				; iterations. The checks expect a contiguous <8 x half> load/store (note the
				; expected width is 8 lanes despite the "v4half" name) and the vector loop
				; branching straight to end (no scalar remainder loop).
				define hidden void @pointer_phi_v4half_add1(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) {
				; CHECK-LABEL: @pointer_phi_v4half_add1(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x half> [[BROADCAST_SPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr half, half* [[A:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr half, half* [[B:%.*]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[NEXT_GEP]] to <8 x half>*
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 4
				; CHECK-NEXT: [[TMP1:%.*]] = fadd fast <8 x half> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast half* [[NEXT_GEP4]] to <8 x half>*
				; CHECK-NEXT: store <8 x half> [[TMP1]], <8 x half>* [[TMP2]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
				; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
				; CHECK-NEXT: br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi half* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi half* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load half, half* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds half, half* %A.addr.09, i32 1
				%add = fadd fast half %0, %y
				store half %add, half* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds half, half* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; half elements, source pointer advances by 2 elements per iteration. The
				; checks expect a <16 x half> wide load whose even lanes are extracted with a
				; shufflevector, a vector loop covering the first 992 iterations, and the
				; original scalar loop kept as the remainder up to 1000.
				define hidden void @pointer_phi_v4half_add2(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) {
				; CHECK-LABEL: @pointer_phi_v4half_add2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr half, half* [[A:%.*]], i32 1984
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr half, half* [[B:%.*]], i32 992
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x half> undef, half [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x half> [[BROADCAST_SPLATINSERT]], <8 x half> undef, <8 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[INDEX]], 1
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr half, half* [[A]], i32 [[TMP0]]
				; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr half, half* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast half* [[NEXT_GEP]] to <16 x half>*
				; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x half>, <16 x half>* [[TMP1]], align 4
				; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x half> [[WIDE_VEC]], <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
				; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast half* [[NEXT_GEP4]] to <8 x half>*
				; CHECK-NEXT: store <8 x half> [[TMP2]], <8 x half>* [[TMP3]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
				; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992
				; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi half* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP5:%.*]] = load half, half* [[A_ADDR_09]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds half, half* [[A_ADDR_09]], i32 2
				; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP5]], [[Y]]
				; CHECK-NEXT: store half [[ADD]], half* [[B_ADDR_07]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, half* [[B_ADDR_07]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop !21
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi half* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi half* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load half, half* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds half, half* %A.addr.09, i32 2
				%add = fadd fast half %0, %y
				store half %add, half* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds half, half* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; half elements, source pointer advances by 3 elements per iteration. The
				; checks expect the loop to be left scalar: no vector.body block appears and
				; the for.body instructions match the input one-for-one.
				define hidden void @pointer_phi_v4half_add3(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) {
				; CHECK-LABEL: @pointer_phi_v4half_add3(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi half* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
				; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = load half, half* [[A_ADDR_09]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds half, half* [[A_ADDR_09]], i32 3
				; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP0]], [[Y:%.*]]
				; CHECK-NEXT: store half [[ADD]], half* [[B_ADDR_07]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, half* [[B_ADDR_07]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]]
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body
				for.body:
				%A.addr.09 = phi half* [ %add.ptr, %for.body ], [ %A, %entry ]
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%B.addr.07 = phi half* [ %incdec.ptr, %for.body ], [ %B, %entry ]
				%0 = load half, half* %A.addr.09, align 4
				%add.ptr = getelementptr inbounds half, half* %A.addr.09, i32 3
				%add = fadd fast half %0, %y
				store half %add, half* %B.addr.07, align 4
				%incdec.ptr = getelementptr inbounds half, half* %B.addr.07, i32 1
				%inc = add nuw nsw i32 %i.08, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body
				end:
				ret void
				}

				; Loop metadata attached via !llvm.loop !0 to the latch branch of
				; @pointer_phi_v4i32_uf2: sets llvm.loop.interleave.count to 2.
				!0 = distinct !{!0, !1}
				!1 = !{!"llvm.loop.interleave.count", i32 2}

				; Read pointer %A advances by 6 x i32 per iteration; the loop carries
				; !llvm.loop !0 (interleave count 2). The checks expect a pointer phi in
				; vector.body feeding two masked gathers, with the pointer stepped by 48.
				define hidden void @pointer_phi_v4i32_uf2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %n, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v4i32_uf2(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 59952
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 9992
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 0, i32 6, i32 12, i32 18>
				; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 24, i32 30, i32 36, i32 42>
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT7]]
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
				; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 4
				; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP6]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
				; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992
				; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 48
				; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !22
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret void
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9992, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[A_ADDR_08]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_08]], i32 6
				; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[Y]]
				; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_06]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop !23
				;

				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%A.addr.08 = phi i32* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%0 = load i32, i32* %A.addr.08, align 4
				%add.ptr = getelementptr inbounds i32, i32* %A.addr.08, i32 6
				%add = add nsw i32 %0, %y
				store i32 %add, i32* %B.addr.06, align 4
				%incdec.ptr = getelementptr inbounds i32, i32* %B.addr.06, i32 1
				%inc = add nuw nsw i32 %i.07, 1
				%exitcond = icmp eq i32 %inc, 10000
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !0
				}

				; Loop metadata attached via !llvm.loop !2 to the latch branch of
				; @pointer_phi_v4i32_uf4: sets llvm.loop.interleave.count to 4.
				!2 = distinct !{!2, !3}
				!3 = !{!"llvm.loop.interleave.count", i32 4}

				; Same loop shape as the interleave-count-2 variant (read pointer %A
				; advances by 6 x i32), but the latch carries !llvm.loop !2 (interleave
				; count 4). The checks expect four masked gathers off one pointer phi,
				; with the pointer stepped by 96 per vector iteration.
				define hidden void @pointer_phi_v4i32_uf4(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %n, i32 %y) {
				; CHECK-LABEL: @pointer_phi_v4i32_uf4(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 59904
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i32, i32* [[B:%.*]], i32 9984
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[Y:%.*]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT12]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i32> undef, i32 [[Y]], i32 0
				; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT14]], <4 x i32> undef, <4 x i32> zeroinitializer
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i32* [ [[A]], [[ENTRY:%.*]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 0, i32 6, i32 12, i32 18>
				; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 24, i32 30, i32 36, i32 42>
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 48, i32 54, i32 60, i32 66>
				; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[POINTER_PHI]], <4 x i32> <i32 72, i32 78, i32 84, i32 90>
				; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[B]], i32 [[INDEX]]
				; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP0]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
				; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]]
				; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT11]]
				; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT13]]
				; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT15]]
				; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[NEXT_GEP]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP8]], align 4
				; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 4
				; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP10]], align 4
				; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 8
				; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP12]], align 4
				; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[NEXT_GEP]], i32 12
				; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
				; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP14]], align 4
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
				; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984
				; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 96
				; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop !24
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret void
				; CHECK: for.body:
				; CHECK-NEXT: [[A_ADDR_08:%.*]] = phi i32* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9984, [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi i32* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[A_ADDR_08]], align 4
				; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[A_ADDR_08]], i32 6
				; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[Y]]
				; CHECK-NEXT: store i32 [[ADD]], i32* [[B_ADDR_06]], align 4
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop !25
				;
				entry:
				br label %for.body

				for.cond.cleanup:
				ret void

				for.body:
				%A.addr.08 = phi i32* [ %A, %entry ], [ %add.ptr, %for.body ]
				%i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
				%0 = load i32, i32* %A.addr.08, align 4
				%add.ptr = getelementptr inbounds i32, i32* %A.addr.08, i32 6
				%add = add nsw i32 %0, %y
				store i32 %add, i32* %B.addr.06, align 4
				%incdec.ptr = getelementptr inbounds i32, i32* %B.addr.06, i32 1
				%inc = add nuw nsw i32 %i.07, 1
				%exitcond = icmp eq i32 %inc, 10000
				br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !2
				}

				; Two pointer inductions in one loop: %x is read and %z is written, each
				; advancing by 3 bytes per iteration with accesses at offsets 0, 1 and 2.
				; The checks expect an overlap check in entry, then two pointer phis in
				; vector.body feeding masked gathers and masked scatters.
				define hidden void @mult_ptr_iv(i8* noalias nocapture readonly %x, i8* noalias nocapture %z) {
				; CHECK-LABEL: @mult_ptr_iv(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[Z:%.*]], i32 3000
				; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[X:%.*]], i32 3000
				; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[SCEVGEP1]], [[Z]]
				; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP]], [[X]]
				; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
				; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]]
				; CHECK: vector.ph:
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[X]], i32 3000
				; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[Z]], i32 3000
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[X]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[POINTER_PHI5:%.*]] = phi i8* [ [[Z]], [[VECTOR_PH]] ], [ [[PTR_IND6:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
				; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[POINTER_PHI5]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP0]], i32 1
				; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP0]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef), !alias.scope !26
				; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP0]], i32 2
				; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP2]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef), !alias.scope !26
				; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP3]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef), !alias.scope !26
				; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], <i8 10, i8 10, i8 10, i8 10>
				; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]]
				; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]]
				; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 1
				; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP4]], <4 x i8*> [[TMP1]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !29, !noalias !26
				; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 2
				; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP5]], <4 x i8*> [[TMP7]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !29, !noalias !26
				; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP6]], <4 x i8*> [[TMP8]], i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !alias.scope !29, !noalias !26
				; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
				; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
				; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i32 12
				; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, i8* [[POINTER_PHI5]], i32 12
				; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop !31
				; CHECK: for.body:
				; CHECK-NEXT: [[X_ADDR_050:%.*]] = phi i8* [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ]
				; CHECK-NEXT: [[Z_ADDR_049:%.*]] = phi i8* [ [[INCDEC_PTR34:%.*]], [[FOR_BODY]] ], [ [[Z]], [[ENTRY]] ]
				; CHECK-NEXT: [[I_048:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
				; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 1
				; CHECK-NEXT: [[TMP10:%.*]] = load i8, i8* [[X_ADDR_050]], align 1
				; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 2
				; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
				; CHECK-NEXT: [[INCDEC_PTR2]] = getelementptr inbounds i8, i8* [[X_ADDR_050]], i32 3
				; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[INCDEC_PTR1]], align 1
				; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 10
				; CHECK-NEXT: [[MUL1:%.*]] = mul i8 [[TMP10]], [[TMP11]]
				; CHECK-NEXT: [[MUL2:%.*]] = mul i8 [[TMP10]], [[TMP12]]
				; CHECK-NEXT: [[INCDEC_PTR32:%.*]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 1
				; CHECK-NEXT: store i8 [[MUL]], i8* [[Z_ADDR_049]], align 1
				; CHECK-NEXT: [[INCDEC_PTR33:%.*]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 2
				; CHECK-NEXT: store i8 [[MUL1]], i8* [[INCDEC_PTR32]], align 1
				; CHECK-NEXT: [[INCDEC_PTR34]] = getelementptr inbounds i8, i8* [[Z_ADDR_049]], i32 3
				; CHECK-NEXT: store i8 [[MUL2]], i8* [[INCDEC_PTR33]], align 1
				; CHECK-NEXT: [[INC]] = add nuw i32 [[I_048]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], !llvm.loop !32
				; CHECK: end:
				; CHECK-NEXT: ret void
				;
				entry:
				br label %for.body

				for.body:
				%x.addr.050 = phi i8* [ %incdec.ptr2, %for.body ], [ %x, %entry ]
				%z.addr.049 = phi i8* [ %incdec.ptr34, %for.body ], [ %z, %entry ]
				%i.048 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
				%incdec.ptr = getelementptr inbounds i8, i8* %x.addr.050, i32 1
				%0 = load i8, i8* %x.addr.050, align 1
				%incdec.ptr1 = getelementptr inbounds i8, i8* %x.addr.050, i32 2
				%1 = load i8, i8* %incdec.ptr, align 1
				%incdec.ptr2 = getelementptr inbounds i8, i8* %x.addr.050, i32 3
				%2 = load i8, i8* %incdec.ptr1, align 1
				%conv = zext i8 %0 to i32
				%mul = mul nuw nsw i32 %conv, 10
				%conv1 = zext i8 %1 to i32
				%conv2 = zext i8 %2 to i32
				%mul1 = mul nuw nsw i32 %conv, %conv1
				%mul2 = mul nuw nsw i32 %conv, %conv2
				%conv3 = trunc i32 %mul to i8
				%conv4 = trunc i32 %mul1 to i8
				%conv5 = trunc i32 %mul2 to i8
				%incdec.ptr32 = getelementptr inbounds i8, i8* %z.addr.049, i32 1
				store i8 %conv3, i8* %z.addr.049, align 1
				%incdec.ptr33 = getelementptr inbounds i8, i8* %z.addr.049, i32 2
				store i8 %conv4, i8* %incdec.ptr32, align 1
				%incdec.ptr34 = getelementptr inbounds i8, i8* %z.addr.049, i32 3
				store i8 %conv5, i8* %incdec.ptr33, align 1
				%inc = add nuw i32 %i.048, 1
				%exitcond = icmp eq i32 %inc, 1000
				br i1 %exitcond, label %end, label %for.body

				end:
				ret void
				}

llvm/test/Transforms/LoopVectorize/pointer-induction.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
				target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"


				dmgreenUnsubmitted Done Reply Inline Actions I think if you use x86 as a target (and needs it for the costing), the test needs to go into test/Transforms/LoopVectorize/X86 in case the target is not compiled in. dmgreen: I think if you use x86 as a target (and needs it for the costing), the test needs to go into…
				fhahnUnsubmitted Done Reply Inline Actions It looks like the options above actually force vectorization with a certain factor. In that case, it is probably best to remove the triple. I'd also consider just checking the loop-vectorize output (without -dce -instcombine), if it is not too messy, as it makes the test more prone to break when something changes in instcombine. Also, it might be possible to only specifically check the IR related to the generated induction, rather than autogenerating the checks, which include a lot of relatively irrelevant stuff. fhahn: It looks like the options above actually force vectorization with a certain factor. In that…
				anwelAuthorUnsubmitted Done Reply Inline Actions Thanks for the feedback, I don't have much experience writing opt tests so your advice is very welcome. I have removed the triple and the metadata, after checking that we don't need them, and reduced the checks to `vector.ph`, `vector.body` and the loop latch that changes the induction variable. anwel: Thanks for the feedback, I don't have much experience writing opt tests so your advice is very…
				anwelAuthorUnsubmitted Done Reply Inline Actions I thought it did need the target information to behave in the right way, but apparently I was mistaken - so no relocation necessary, I removed the target. anwel: I thought it did need the target information to behave in the right way, but apparently I was…
				; Function Attrs: nofree norecurse nounwind
				; The loop steps a pointer backwards from null by one byte per iteration
				; until it equals %b, storing 95 through it when the loaded byte is
				; non-zero. The checks cover vector.ph, the start of vector.body and the
				; latch update of the pointer induction.
				define void @a(i8* readnone %b) {
				; CHECK-LABEL: @a(
				dmgreenUnsubmitted Done Reply Inline Actions Also some of this might be able to be cleaned up, like the local_unnamed_addr, the metadata and all/most(?) of the attributes. dmgreen: Also some of this might be able to be cleaned up, like the local_unnamed_addr, the metadata and…
				anwelAuthorUnsubmitted Done Reply Inline Actions Should be a lot cleaner now. anwel: Should be a lot cleaner now.
				; CHECK: vector.ph:
				; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0:%.*]], 4
				; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
				; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], -1
				; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP1]]
				; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
				; CHECK: vector.body:
				; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ null, %vector.ph ], [ [[PTR_IND:%.*]], %pred.store.continue7 ]
				; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %pred.store.continue7 ]
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 -1, i64 -2, i64 -3>
				; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 -1
				; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8*> [[TMP3]], i32 0
				; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[TMP4]], i32 0
				; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP5]], i32 -3
				; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
				; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
				; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
				; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[REVERSE]], zeroinitializer
				; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], <i1 true, i1 true, i1 true, i1 true>
				; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0
				; CHECK: pred.store.continue7:
				; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
				; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
				; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 -4

				entry:
				%cmp.not4 = icmp eq i8* %b, null
				br i1 %cmp.not4, label %for.cond.cleanup, label %for.body.preheader

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %if.end
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				ret void

				for.body: ; preds = %for.body.preheader, %if.end
				%c.05 = phi i8* [ %incdec.ptr, %if.end ], [ null, %for.body.preheader ]
				%incdec.ptr = getelementptr inbounds i8, i8* %c.05, i64 -1
				%0 = load i8, i8* %incdec.ptr, align 1
				%tobool.not = icmp eq i8 %0, 0
				br i1 %tobool.not, label %if.end, label %if.then

				if.then: ; preds = %for.body
				store i8 95, i8* %incdec.ptr, align 1
				br label %if.end

				if.end: ; preds = %for.body, %if.then
				%cmp.not = icmp eq i8* %incdec.ptr, %b
				br i1 %cmp.not, label %for.cond.cleanup.loopexit, label %for.body
				}

This is an archive of the discontinued LLVM Phabricator instance.

[LV] Enable the LoopVectorizer to create pointer inductions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 278737

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll

llvm/test/Transforms/LoopVectorize/pointer-induction.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LV] Enable the LoopVectorizer to create pointer inductionsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 278737

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll

llvm/test/Transforms/LoopVectorize/pointer-induction.ll

[LV] Enable the LoopVectorizer to create pointer inductions
ClosedPublic