Diff 344751

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Show First 20 Lines • Show All 83 Lines • ▼ Show 20 Lines	void foldExtExtCmp(ExtractElementInst Ext0, ExtractElementInst Ext1,
Instruction &I);		Instruction &I);
void foldExtExtBinop(ExtractElementInst Ext0, ExtractElementInst Ext1,		void foldExtExtBinop(ExtractElementInst Ext0, ExtractElementInst Ext1,
Instruction &I);		Instruction &I);
bool foldExtractExtract(Instruction &I);		bool foldExtractExtract(Instruction &I);
bool foldBitcastShuf(Instruction &I);		bool foldBitcastShuf(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);		bool scalarizeBinopOrCmp(Instruction &I);
bool foldExtractedCmps(Instruction &I);		bool foldExtractedCmps(Instruction &I);
bool foldSingleElementStore(Instruction &I);		bool foldSingleElementStore(Instruction &I);
		bool scalarizeLoadExtract(Instruction &I);
};		};
} // namespace		} // namespace

static void replaceValue(Value &Old, Value &New) {		static void replaceValue(Value &Old, Value &New) {
Old.replaceAllUsesWith(&New);		Old.replaceAllUsesWith(&New);
New.takeName(&Old);		New.takeName(&Old);
}		}

bool VectorCombine::vectorizeLoadInsert(Instruction &I) {		bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.		// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.		// TODO: Handle non-zero insert index.
auto *Ty = dyn_cast<FixedVectorType>(I.getType());		auto *Ty = dyn_cast<FixedVectorType>(I.getType());
Value *Scalar;		Value *Scalar;
if (!Ty \|\| !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) \|\|		if (!Ty \|\| !match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) \|\|
!Scalar->hasOneUse())		!Scalar->hasOneUse())
return false;		return false;

// Optionally match an extract from another vector.		// Optionally match an extract from another vector.
Value *X;		Value *X;
bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));		bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
if (!HasExtract)		if (!HasExtract)
X = Scalar;		X = Scalar;

// Match source value as load of scalar or vector.		// Match source value as load of scalar or vector.
// Do not vectorize scalar load (widening) if atomic/volatile or under		// Do not vectorize scalar load (widening) if atomic/volatile or under
// asan/hwasan/memtag/tsan. The widened load may load data from dirty regions		// asan/hwasan/memtag/tsan. The widened load may load data from dirty regions
// or create data races non-existent in the source.		// or create data races non-existent in the source.
		spatelUnsubmitted Not Done Reply Inline Actions I didn't check the bot failure, but we dealt with a previous sanitizer failure with an additional predicate here. spatel: I didn't check the bot failure, but we dealt with a previous sanitizer failure with an…
		fhahnAuthorUnsubmitted Done Reply Inline Actions I think the issue was that `scalarizeLoadExtract` was called after `foldSingleElementStore`, which may remove instructions. Should be fixed by 4e8c28b6fbec fhahn: I think the issue was that `scalarizeLoadExtract` was called after `foldSingleElementStore`…
auto *Load = dyn_cast<LoadInst>(X);		auto *Load = dyn_cast<LoadInst>(X);
if (!Load \|\| !Load->isSimple() \|\| !Load->hasOneUse() \|\|		if (!Load \|\| !Load->isSimple() \|\| !Load->hasOneUse() \|\|
Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) \|\|		Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) \|\|
mustSuppressSpeculation(*Load))		mustSuppressSpeculation(*Load))
return false;		return false;

const DataLayout &DL = I.getModule()->getDataLayout();		const DataLayout &DL = I.getModule()->getDataLayout();
Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();		Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
▲ Show 20 Lines • Show All 639 Lines • ▼ Show 20 Lines	static bool isMemModifiedBetween(BasicBlock::iterator Begin,
const MemoryLocation &Loc, AAResults &AA) {		const MemoryLocation &Loc, AAResults &AA) {
unsigned NumScanned = 0;		unsigned NumScanned = 0;
return std::any_of(Begin, End, [&](const Instruction &Instr) {		return std::any_of(Begin, End, [&](const Instruction &Instr) {
return isModSet(AA.getModRefInfo(&Instr, Loc)) \|\|		return isModSet(AA.getModRefInfo(&Instr, Loc)) \|\|
++NumScanned > MaxInstrsToScan;		++NumScanned > MaxInstrsToScan;
});		});
}		}

		/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
		/// Idx. \p Idx must access a valid vector element.
		static bool canScalarizeAccess(FixedVectorType VecTy, ConstantInt Idx) {
		return Idx->getValue().ult(VecTy->getNumElements());
		}

// Combine patterns like:		// Combine patterns like:
// %0 = load <4 x i32>, <4 x i32>* %a		// %0 = load <4 x i32>, <4 x i32>* %a
// %1 = insertelement <4 x i32> %0, i32 %b, i32 1		// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
// store <4 x i32> %1, <4 x i32>* %a		// store <4 x i32> %1, <4 x i32>* %a
// to:		// to:
// %0 = bitcast <4 x i32>* %a to i32*		// %0 = bitcast <4 x i32>* %a to i32*
// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1		// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
// store i32 %b, i32* %1		// store i32 %b, i32* %1
bool VectorCombine::foldSingleElementStore(Instruction &I) {		bool VectorCombine::foldSingleElementStore(Instruction &I) {
StoreInst *SI = dyn_cast<StoreInst>(&I);		StoreInst *SI = dyn_cast<StoreInst>(&I);
if (!SI \|\| !SI->isSimple() \|\|		if (!SI \|\| !SI->isSimple() \|\|
!isa<FixedVectorType>(SI->getValueOperand()->getType()))		!isa<FixedVectorType>(SI->getValueOperand()->getType()))
return false;		return false;

// TODO: Combine more complicated patterns (multiple insert) by referencing		// TODO: Combine more complicated patterns (multiple insert) by referencing
// TargetTransformInfo.		// TargetTransformInfo.
Instruction *Source;		Instruction *Source;
Value *NewElement;		Value *NewElement;
ConstantInt *Idx;		ConstantInt *Idx;
if (!match(SI->getValueOperand(),		if (!match(SI->getValueOperand(),
m_InsertElt(m_Instruction(Source), m_Value(NewElement),		m_InsertElt(m_Instruction(Source), m_Value(NewElement),
m_ConstantInt(Idx))))		m_ConstantInt(Idx))))
return false;		return false;

if (auto *Load = dyn_cast<LoadInst>(Source)) {		if (auto *Load = dyn_cast<LoadInst>(Source)) {
auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());		auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
const DataLayout &DL = I.getModule()->getDataLayout();		const DataLayout &DL = I.getModule()->getDataLayout();
Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();		Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
// Don't optimize for atomic/volatile load or store. Ensure memory is not		// Don't optimize for atomic/volatile load or store. Ensure memory is not
// modified between, vector type matches store size, and index is inbounds.		// modified between, vector type matches store size, and index is inbounds.
		lebedev.riUnsubmitted Not Done Reply Inline Actions Since we would be extracting from the original value, why do we care if it would have been overwritten sometime later? Can't we just emit all loads right where the wide load was? lebedev.ri: Since we would be extracting from the original value, why do we care if it would have been…
		fhahnAuthorUnsubmitted Done Reply Inline Actions Can't we just emit all loads right where the wide load was? We could, if the indices are all defined before the wide load. That's not the case in the cases I am most interested in. At the moment the scalar loads are inserted at the point of the extracts, but if the index is defined before the wide load, we could create the narrow loads next to the widened loads. But that may be good as a follow-up extension? fhahn: > Can't we just emit all loads right where the wide load was? We could, if the indices are…
if (!Load->isSimple() \|\| Load->getParent() != SI->getParent() \|\|		if (!Load->isSimple() \|\| Load->getParent() != SI->getParent() \|\|
!DL.typeSizeEqualsStoreSize(Load->getType()) \|\|		!DL.typeSizeEqualsStoreSize(Load->getType()) \|\|
		lebedev.riUnsubmitted Not Done Reply Inline Actions Do we expect that all the CSE have happened by now? lebedev.ri: Do we expect that all the CSE have happened by now?
		fhahnAuthorUnsubmitted Done Reply Inline Actions Most of it yes, but not for the code that the SLP vectorizer creates. We don't run dedicated CSE later either, just instcombine. fhahn: Most of it yes, but not for the code that the SLP vectorizer creates. We don't run dedicated…
Idx->uge(VecTy->getNumElements()) \|\|		!canScalarizeAccess(VecTy, Idx) \|\|
SrcAddr != SI->getPointerOperand()->stripPointerCasts() \|\|		SrcAddr != SI->getPointerOperand()->stripPointerCasts() \|\|
isMemModifiedBetween(Load->getIterator(), SI->getIterator(),		isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
MemoryLocation::get(SI), AA))		MemoryLocation::get(SI), AA))
return false;		return false;

Value *GEP = GetElementPtrInst::CreateInBounds(		Value *GEP = GetElementPtrInst::CreateInBounds(
SI->getPointerOperand(), {ConstantInt::get(Idx->getType(), 0), Idx});		SI->getPointerOperand(), {ConstantInt::get(Idx->getType(), 0), Idx});
Builder.Insert(GEP);		Builder.Insert(GEP);
StoreInst *NSI = Builder.CreateStore(NewElement, GEP);		StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
NSI->copyMetadata(*SI);		NSI->copyMetadata(*SI);
if (SI->getAlign() < NSI->getAlign())		if (SI->getAlign() < NSI->getAlign())
NSI->setAlignment(SI->getAlign());		NSI->setAlignment(SI->getAlign());
replaceValue(I, *NSI);		replaceValue(I, *NSI);
// Need erasing the store manually.		// Need erasing the store manually.
I.eraseFromParent();		I.eraseFromParent();
return true;		return true;
}		}

return false;		return false;
}		}

		/// Try to scalarize vector loads feeding extractelement instructions.
		bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
		Value *Ptr;
		ConstantInt *Idx;
		if (!match(&I, m_ExtractElt(m_Load(m_Value(Ptr)), m_ConstantInt(Idx))))
		return false;

		auto *LI = cast<LoadInst>(I.getOperand(0));
		const DataLayout &DL = I.getModule()->getDataLayout();
		if (LI->isVolatile() \|\| !DL.typeSizeEqualsStoreSize(LI->getType()))
		return false;

		auto *FixedVT = dyn_cast<FixedVectorType>(LI->getType());
		if (!FixedVT)
		return false;

		if (!canScalarizeAccess(FixedVT, Idx))
		return false;

		InstructionCost OriginalCost = TTI.getMemoryOpCost(
		Instruction::Load, LI->getType(), Align(LI->getAlignment()),
		LI->getPointerAddressSpace());
		InstructionCost ScalarizedCost = 0;

		Instruction *LastCheckedInst = LI;
		unsigned NumInstChecked = 0;
		// Check if all users of the load are extracts with no memory modifications
		// between the load and the extract. Compute the cost of both the original
		// code and the scalarized version.
		for (User *U : LI->users()) {
		auto *UI = dyn_cast<ExtractElementInst>(U);
		if (!UI \|\| UI->getParent() != LI->getParent())
		return false;

		// Check if any instruction between the load and the extract may modify
		// memory.
		if (LastCheckedInst->comesBefore(UI)) {
		for (Instruction &I :
		make_range(std::next(LI->getIterator()), UI->getIterator())) {
		// Bail out if we reached the check limit or the instruction may write
		// to memory.
		if (NumInstChecked == 6 \|\| I.mayWriteToMemory())
		RKSimonUnsubmitted Not Done Reply Inline Actions Pull out the magic number (MaxInstChecked = 6 ?) RKSimon: Pull out the magic number (MaxInstChecked = 6 ?)
		fhahnAuthorUnsubmitted Done Reply Inline Actions I updated the code to use `MaxInstrsToScan` which was added by an earlier patch. fhahn: I updated the code to use `MaxInstrsToScan` which was added by an earlier patch.
		return false;
		NumInstChecked++;
		}
		}

		if (!LastCheckedInst)
		LastCheckedInst = UI;
		else if (LastCheckedInst->comesBefore(UI))
		LastCheckedInst = UI;

		auto *Index = dyn_cast<ConstantInt>(UI->getOperand(1));
		OriginalCost +=
		TTI.getVectorInstrCost(Instruction::ExtractElement, LI->getType(),
		Index ? Index->getZExtValue() : -1);
		ScalarizedCost +=
		TTI.getMemoryOpCost(Instruction::Load, FixedVT->getElementType(),
		Align(1), LI->getPointerAddressSpace());
		ScalarizedCost += TTI.getAddressComputationCost(FixedVT->getElementType());
		}

		if (ScalarizedCost >= OriginalCost)
		return false;

		// Replace extracts with narrow scalar loads.
		for (User *U : LI->users()) {
		RKSimonUnsubmitted Not Done Reply Inline Actions Why are you forcing the alignment ? RKSimon: Why are you forcing the alignment ?
		fhahnAuthorUnsubmitted Done Reply Inline Actions I think we have to update the alignment of the scalar load, because after applying an offset to the pointer it may not be aligned as specified on the original load. I think we should be able to use the common alignment between the alignment on the load and the scalar type. WDYT? fhahn: I think we have to update the alignment of the scalar load, because after applying an offset to…
		RKSimonUnsubmitted Not Done Reply Inline Actions Yes, using a common alignment would make more sense RKSimon: Yes, using a common alignment would make more sense
		fhahnAuthorUnsubmitted Done Reply Inline Actions I updated the code to use the common alignment. fhahn: I updated the code to use the common alignment.
		auto *EI = cast<ExtractElementInst>(U);
		IRBuilder<>::InsertPointGuard Guard(Builder);
		Builder.SetInsertPoint(EI);
		Value *GEP = Builder.CreateInBoundsGEP(
		FixedVT, Ptr, {Builder.getInt32(0), EI->getOperand(1)});
		auto *NewLoad = cast<LoadInst>(Builder.CreateLoad(
		FixedVT->getElementType(), GEP, EI->getName() + ".scalar"));
		NewLoad->setAlignment(Align(1));
		replaceValue(EI, NewLoad);
		}

		return true;
		}

/// This is the entry point for all transforms. Pass manager differences are		/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.		/// handled in the callers of this function.
bool VectorCombine::run() {		bool VectorCombine::run() {
if (DisableVectorCombine)		if (DisableVectorCombine)
return false;		return false;

// Don't attempt vectorization if the target does not support vectors.		// Don't attempt vectorization if the target does not support vectors.
if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/Vector/ true)))		if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/Vector/ true)))
Show All 10 Lines	for (Instruction &I : make_early_inc_range(BB)) {
continue;		continue;
Builder.SetInsertPoint(&I);		Builder.SetInsertPoint(&I);
MadeChange \|= vectorizeLoadInsert(I);		MadeChange \|= vectorizeLoadInsert(I);
MadeChange \|= foldExtractExtract(I);		MadeChange \|= foldExtractExtract(I);
MadeChange \|= foldBitcastShuf(I);		MadeChange \|= foldBitcastShuf(I);
MadeChange \|= scalarizeBinopOrCmp(I);		MadeChange \|= scalarizeBinopOrCmp(I);
MadeChange \|= foldExtractedCmps(I);		MadeChange \|= foldExtractedCmps(I);
MadeChange \|= foldSingleElementStore(I);		MadeChange \|= foldSingleElementStore(I);
		MadeChange \|= scalarizeLoadExtract(I);
}		}
}		}

// We're done with transforms, so remove dead instructions.		// We're done with transforms, so remove dead instructions.
if (MadeChange)		if (MadeChange)
for (BasicBlock &BB : F)		for (BasicBlock &BB : F)
SimplifyInstructionsInBlock(&BB);		SimplifyInstructionsInBlock(&BB);

▲ Show 20 Lines • Show All 63 Lines • Show Last 20 Lines

llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-combine -mtriple=arm64-apple-darwinos -S %s \| FileCheck %s		; RUN: opt -vector-combine -mtriple=arm64-apple-darwinos -S %s \| FileCheck %s

define i32 @load_extract_idx_0(<4 x i32>* %x) {		define i32 @load_extract_idx_0(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_idx_0(		; CHECK-LABEL: @load_extract_idx_0(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 3
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 3		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 3		%r = extractelement <4 x i32> %lv, i32 3
ret i32 %r		ret i32 %r
}		}

define i32 @load_extract_idx_1(<4 x i32>* %x) {		define i32 @load_extract_idx_1(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_idx_1(		; CHECK-LABEL: @load_extract_idx_1(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 1
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 1		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 1		%r = extractelement <4 x i32> %lv, i32 1
ret i32 %r		ret i32 %r
}		}

define i32 @load_extract_idx_2(<4 x i32>* %x) {		define i32 @load_extract_idx_2(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_idx_2(		; CHECK-LABEL: @load_extract_idx_2(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 2
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 2		%r = extractelement <4 x i32> %lv, i32 2
ret i32 %r		ret i32 %r
}		}

define i32 @load_extract_idx_3(<4 x i32>* %x) {		define i32 @load_extract_idx_3(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_idx_3(		; CHECK-LABEL: @load_extract_idx_3(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 3
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 3		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 3		%r = extractelement <4 x i32> %lv, i32 3
ret i32 %r		ret i32 %r
}		}

		; Out-of-bounds index for extractelement, should not be converted to narrow
		; load, because it would introduce a dereference of a poison pointer.
		define i32 @load_extract_idx_4(<4 x i32>* %x) {
		; CHECK-LABEL: @load_extract_idx_4(
		; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16
		; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 4
		; CHECK-NEXT: ret i32 [[R]]
		;
		%lv = load <4 x i32>, <4 x i32>* %x
		%r = extractelement <4 x i32> %lv, i32 4
		ret i32 %r
		}

define i32 @load_extract_idx_var_i64(<4 x i32>* %x, i64 %idx) {		define i32 @load_extract_idx_var_i64(<4 x i32>* %x, i64 %idx) {
; CHECK-LABEL: @load_extract_idx_var_i64(		; CHECK-LABEL: @load_extract_idx_var_i64(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16
; CHECK-NEXT: [[R:%.]] = extractelement <4 x i32> [[LV]], i64 [[IDX:%.]]		; CHECK-NEXT: [[R:%.]] = extractelement <4 x i32> [[LV]], i64 [[IDX:%.]]
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i64 %idx		%r = extractelement <4 x i32> %lv, i64 %idx
Show All 11 Lines	;
ret i32 %r		ret i32 %r
}		}

declare void @clobber()		declare void @clobber()

define i32 @load_extract_clobber_call_before(<4 x i32>* %x) {		define i32 @load_extract_clobber_call_before(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_clobber_call_before(		; CHECK-LABEL: @load_extract_clobber_call_before(
; CHECK-NEXT: call void @clobber()		; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 2
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
call void @clobber()		call void @clobber()
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 2		%r = extractelement <4 x i32> %lv, i32 2
ret i32 %r		ret i32 %r
}		}

define i32 @load_extract_clobber_call_between(<4 x i32>* %x) {		define i32 @load_extract_clobber_call_between(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_clobber_call_between(		; CHECK-LABEL: @load_extract_clobber_call_between(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16
; CHECK-NEXT: call void @clobber()		; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2		; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
call void @clobber()		call void @clobber()
%r = extractelement <4 x i32> %lv, i32 2		%r = extractelement <4 x i32> %lv, i32 2
ret i32 %r		ret i32 %r
}		}

define i32 @load_extract_clobber_call_after(<4 x i32>* %x) {		define i32 @load_extract_clobber_call_after(<4 x i32>* %x) {
; CHECK-LABEL: @load_extract_clobber_call_after(		; CHECK-LABEL: @load_extract_clobber_call_after(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 2
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: call void @clobber()		; CHECK-NEXT: call void @clobber()
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 2		%r = extractelement <4 x i32> %lv, i32 2
call void @clobber()		call void @clobber()
ret i32 %r		ret i32 %r
}		}

define i32 @load_extract_clobber_store_before(<4 x i32>* %x, i8* %y) {		define i32 @load_extract_clobber_store_before(<4 x i32>* %x, i8* %y) {
; CHECK-LABEL: @load_extract_clobber_store_before(		; CHECK-LABEL: @load_extract_clobber_store_before(
; CHECK-NEXT: store i8 0, i8* [[Y:%.*]], align 1		; CHECK-NEXT: store i8 0, i8* [[Y:%.*]], align 1
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[TMP1:%.]] = getelementptr inbounds <4 x i32>, <4 x i32> [[X:%.*]], i32 0, i32 2
; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2		; CHECK-NEXT: [[R:%.]] = load i32, i32 [[TMP1]], align 1
; CHECK-NEXT: ret i32 [[R]]		; CHECK-NEXT: ret i32 [[R]]
;		;
store i8 0, i8* %y		store i8 0, i8* %y
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
%r = extractelement <4 x i32> %lv, i32 2		%r = extractelement <4 x i32> %lv, i32 2
ret i32 %r		ret i32 %r
}		}

▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines
; CHECK-NEXT: ret i31 [[R]]		; CHECK-NEXT: ret i31 [[R]]
;		;
%lv = load <4 x i31>, <4 x i31>* %x		%lv = load <4 x i31>, <4 x i31>* %x
%r = extractelement <4 x i31> %lv, i32 1		%r = extractelement <4 x i31> %lv, i32 1
ret i31 %r		ret i31 %r
}		}

; Scalarizing the load for multiple constant indices may not be profitable.		; Scalarizing the load for multiple constant indices may not be profitable.
define i32 @load_multiple_extracts_with_constant_idx(<4 x i32>* %x) {		define i32 @load_multiple_extracts_with_constant_idx(<4 x i32>* %x) {
		spatelUnsubmitted Not Done Reply Inline Actions Let me know if I'm overlooking it, but I don't see a positive test for this pattern (multiple extracts of a single load). Is there some vector type variant (float?) of this test that will transform for AArch64? If not, we might get x86 or some other target to trigger on this test as-is. spatel: Let me know if I'm overlooking it, but I don't see a positive test for this pattern (multiple…
		fhahnAuthorUnsubmitted Done Reply Inline Actions Yes I think this case was missing. I added `load_multiple_extracts_with_constant_idx_profitable` to cover that I think. fhahn: Yes I think this case was missing. I added `load_multiple_extracts_with_constant_idx_profitable…
; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(		; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16		; CHECK-NEXT: [[LV:%.]] = load <4 x i32>, <4 x i32> [[X:%.*]], align 16
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>		; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]		; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0		; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: ret i32 [[RES]]		; CHECK-NEXT: ret i32 [[RES]]
;		;
%lv = load <4 x i32>, <4 x i32>* %x		%lv = load <4 x i32>, <4 x i32>* %x
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Show First 20 Lines • Show All 624 Lines • ▼ Show 20 Lines	;
ret <8 x i32> %r		ret <8 x i32> %r
}		}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.		; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {		define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(		; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; SSE2-NEXT: [[GEP:%.]] = getelementptr inbounds <2 x i16>, <2 x i16> [[P:%.*]], i64 1		; SSE2-NEXT: [[GEP:%.]] = getelementptr inbounds <2 x i16>, <2 x i16> [[P:%.*]], i64 1
; SSE2-NEXT: [[L:%.]] = load <2 x i16>, <2 x i16> [[GEP]], align 8		; SSE2-NEXT: [[TMP1:%.]] = getelementptr inbounds <2 x i16>, <2 x i16> [[GEP]], i32 0, i32 0
; SSE2-NEXT: [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0		; SSE2-NEXT: [[S:%.]] = load i16, i16 [[TMP1]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0		; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; SSE2-NEXT: ret <8 x i16> [[R]]		; SSE2-NEXT: ret <8 x i16> [[R]]
;		;
; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(		; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; AVX2-NEXT: [[TMP1:%.]] = bitcast <2 x i16> [[P:%.]] to <8 x i16>		; AVX2-NEXT: [[TMP1:%.]] = bitcast <2 x i16> [[P:%.]] to <8 x i16>
; AVX2-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 4		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 1
; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>		; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: ret <8 x i16> [[R]]		; AVX2-NEXT: ret <8 x i16> [[R]]
;		;
%gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1		%gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
%l = load <2 x i16>, <2 x i16>* %gep, align 8		%l = load <2 x i16>, <2 x i16>* %gep, align 8
		RKSimonUnsubmitted Not Done Reply Inline Actions The arg says it's align 1, but here we're saying align 8? RKSimon: The arg says it's align 1, but here we're saying align 8?
		fhahnAuthorUnsubmitted Done Reply Inline Actions Hm that's indeed odd. I think the align of the load would take precedence and `align 1 dereferenceable(16)` are probably not needed for the test. But I think we need to choose a conservative alignment for the scalarized load, that's why `align 1` is used for the scalar load. Does that answer your question? fhahn: Hm that's indeed odd. I think the align of the load would take precedence and `align 1…
		spatelUnsubmitted Not Done Reply Inline Actions Some of these tests are intentionally bizarre -- blame me :) -- with respect to alignment because the rules are not clear. Looking at the code in vectorizeLoadInsert(), we have: // Use the greater of the alignment on the load or its source pointer. Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment); So on the AVX2 target, this test is going through 2 transforms now instead of just 1: This patch kicks in 1st and converts the short vector load to scalar load with `align 2` -- presumably because that's the datalayout-ABI-specified setting. Should the original load's `align 8` attribute override that? The scalar load is converted to the wider vector load again in vectorizeLoadInsert(), but now the alignment was reduced, so we end up with the test diff. spatel: Some of these tests are intentionally bizarre -- blame me :) -- with respect to alignment…
		fhahnAuthorUnsubmitted Done Reply Inline Actions Thanks for the explanation. I think once we lowered the alignment on the load, it is not really safe to raise it based on the alignment of the pointer type. But if we scalarize with index 0, we load from the original pointer and we can preserve the original alignment. I updated the patch and the AVX2 changes are now gone. fhahn: Thanks for the explanation. I think once we lowered the alignment on the load, it is not really…
%s = extractelement <2 x i16> %l, i32 0		%s = extractelement <2 x i16> %l, i32 0
%r = insertelement <8 x i16> poison, i16 %s, i64 0		%r = insertelement <8 x i16> poison, i16 %s, i64 0
ret <8 x i16> %r		ret <8 x i16> %r
}		}

llvm/test/Transforms/VectorCombine/X86/load.ll

Show First 20 Lines • Show All 624 Lines • ▼ Show 20 Lines	;
ret <8 x i32> %r		ret <8 x i32> %r
}		}

; Can't safely load the offset vector, but can load+shuffle if it is profitable.		; Can't safely load the offset vector, but can load+shuffle if it is profitable.

define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {		define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) {
; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(		; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; SSE2-NEXT: [[GEP:%.]] = getelementptr inbounds <2 x i16>, <2 x i16> [[P:%.*]], i64 1		; SSE2-NEXT: [[GEP:%.]] = getelementptr inbounds <2 x i16>, <2 x i16> [[P:%.*]], i64 1
; SSE2-NEXT: [[L:%.]] = load <2 x i16>, <2 x i16> [[GEP]], align 8		; SSE2-NEXT: [[TMP1:%.]] = getelementptr inbounds <2 x i16>, <2 x i16> [[GEP]], i32 0, i32 0
; SSE2-NEXT: [[S:%.*]] = extractelement <2 x i16> [[L]], i32 0		; SSE2-NEXT: [[S:%.]] = load i16, i16 [[TMP1]], align 1
; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0		; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; SSE2-NEXT: ret <8 x i16> [[R]]		; SSE2-NEXT: ret <8 x i16> [[R]]
;		;
; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(		; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
; AVX2-NEXT: [[TMP1:%.]] = bitcast <2 x i16> [[P:%.]] to <8 x i16>		; AVX2-NEXT: [[TMP1:%.]] = bitcast <2 x i16> [[P:%.]] to <8 x i16>
; AVX2-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 4		; AVX2-NEXT: [[TMP2:%.]] = load <8 x i16>, <8 x i16> [[TMP1]], align 1
; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>		; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; AVX2-NEXT: ret <8 x i16> [[R]]		; AVX2-NEXT: ret <8 x i16> [[R]]
;		;
%gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1		%gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1
%l = load <2 x i16>, <2 x i16>* %gep, align 8		%l = load <2 x i16>, <2 x i16>* %gep, align 8
%s = extractelement <2 x i16> %l, i32 0		%s = extractelement <2 x i16> %l, i32 0
%r = insertelement <8 x i16> undef, i16 %s, i64 0		%r = insertelement <8 x i16> undef, i16 %s, i64 0
ret <8 x i16> %r		ret <8 x i16> %r
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[VectorCombine] Scalarize vector load/extract.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 344751

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

llvm/test/Transforms/VectorCombine/X86/load.ll

This is an archive of the discontinued LLVM Phabricator instance.

[VectorCombine] Scalarize vector load/extract.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 344751

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

llvm/test/Transforms/VectorCombine/X86/load.ll

[VectorCombine] Scalarize vector load/extract.
ClosedPublic