Diff 364137

llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	struct SVEIntrinsicOpts : public ModulePass {

bool runOnModule(Module &M) override;		bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;		void getAnalysisUsage(AnalysisUsage &AU) const override;

private:		private:
bool coalescePTrueIntrinsicCalls(BasicBlock &BB,		bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
SmallSetVector<IntrinsicInst *, 4> &PTrues);		SmallSetVector<IntrinsicInst *, 4> &PTrues);
bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);		bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
		bool optimizePredicateStore(Instruction *I);
		bool optimizePredicateLoad(Instruction *I);

		bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);

/// Operates at the function-scope. I.e., optimizations are applied local to		/// Operates at the function-scope. I.e., optimizations are applied local to
/// the functions themselves.		/// the functions themselves.
bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);		bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
};		};
} // end anonymous namespace		} // end anonymous namespace

void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {		void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
▲ Show 20 Lines • Show All 201 Lines • ▼ Show 20 Lines	for (auto &BB : *F) {
Changed \|= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);		Changed \|= coalescePTrueIntrinsicCalls(BB, SVAllPTrues);
Changed \|= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);		Changed \|= coalescePTrueIntrinsicCalls(BB, SVPow2PTrues);
}		}
}		}

return Changed;		return Changed;
}		}

		// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
		// scalable stores as late as possible
		bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
		paulwalker-armUnsubmitted Done Reply Inline Actions Is this name representative? I think `optimizePredicateStore` is more in keeping with what is going on. paulwalker-arm: Is this name representative? I think `optimizePredicateStore` is more in keeping with what is…
		auto *F = I->getFunction();
		paulwalker-armUnsubmitted Done Reply Inline Actions Not that bothered but you could pass this into the function given optimizeInstructions already knows it. If you do keep the lookup then can you use `I->getFunction()`. paulwalker-arm: Not that bothered but you could pass this into the function given optimizeInstructions already…
		auto Attr = F->getFnAttribute(Attribute::VScaleRange);
		if (!Attr.isValid())
		return false;

		unsigned MinVScale, MaxVScale;
		std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
		// The transform needs to know the exact runtime length of scalable vectors
		efriedmaUnsubmitted Done Reply Inline Actions Do you need to check getMaxSVEVectorSizeInBits() somewhere? This code could probably use some comments explaining why it's checking various conditions. efriedma: Do you need to check getMaxSVEVectorSizeInBits() somewhere? This code could probably use some…
		if (MinVScale != MaxVScale \|\| MinVScale == 0)
		paulwalker-armUnsubmitted Done Reply Inline Actions These are really MinVScale/MaxVScale. paulwalker-arm: These are really MinVScale/MaxVScale.
		return false;

		junparserUnsubmitted Done Reply Inline Actions Why do we disable differing vscale min/max? I do not see any difference between vscale_range(4,0) and vscale_range(4,6). junparser: Why do we disable differing vscale min/max? I do not see any difference between vscale_range(4…
		paulwalker-armUnsubmitted Done Reply Inline Actions I think this needs `\|\| MinSVEVectorSize == 0` to cover the vscale_range(0,0) case. Perhaps worth adding a comment along the lines of "The transform needs to know the exact runtime length of scalable vectors". To bring this point home perhaps it's worth introducing `unsigned SVEVectorSize = MinVScale * 128;`, but I'll leave you to decide if it's worth it. paulwalker-arm: I think this needs `\|\| MinSVEVectorSize == 0` to cover the vscale_range(0,0) case. Perhaps…
		auto *PredType =
		ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
		auto *FixedPredType =
		FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);

		// If we have a store..
		auto *Store = dyn_cast<StoreInst>(I);
		if (!Store \|\| !Store->isSimple())
		paulwalker-armUnsubmitted Done Reply Inline Actions To keep things simple (pun intended) I suggest adding `\|\| !Store->isSimple())` so that we only transform ordinary stores. paulwalker-arm: To keep things simple (pun intended) I suggest adding ` \|\| !Store->isSimple())` so that we only…
		return false;

		// ..that is storing a predicate vector sized worth of bits..
		if (Store->getOperand(0)->getType() != FixedPredType)
		paulwalker-armUnsubmitted Done Reply Inline Actions Is the `dyn_cast` necessary? paulwalker-arm: Is the `dyn_cast` necessary?
		return false;

		// ..where the value stored comes from a vector extract..
		auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
		if (!IntrI \|\|
		IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
		return false;
		paulwalker-armUnsubmitted Done Reply Inline Actions I've got a feeling this needs to be more specific otherwise we'll introduce endianness issues. Specifically, we know predicate load/store instructions are byte based and so we should only allow the transform when the fixed length load/store instructions are also byte based. Given this is all very specific I imagine you'll end up with something like `if (Store->getOperand(0)->getType() == PrecalculatedTy)` paulwalker-arm: I've got a feeling this needs to be more specific otherwise we'll introduce endianness issues.

		// ..that is extracting from index 0..
		if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
		return false;

		// ..where the value being extract from comes from a bitcast
		auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
		paulwalker-armUnsubmitted Done Reply Inline Actions Is this restriction strictly necessary? I'm thinking we might have multiple stores of the same value so unless there is a real effect on code quality I'd rather not artificially restrict the transform. paulwalker-arm: Is this restriction strictly necessary? I'm thinking we might have multiple stores of the same…
		if (!BitCast)
		return false;

		// ..and the bitcast is casting from predicate type
		if (BitCast->getOperand(0)->getType() != PredType)
		paulwalker-armUnsubmitted Done Reply Inline Actions `cast<ConstantInt>(IntrI->getOperand(1))->isZero()`? paulwalker-arm: `cast<ConstantInt>(IntrI->getOperand(1))->isZero()`?
		return false;

		IRBuilder<> Builder(I->getContext());
		Builder.SetInsertPoint(I);

		auto *PtrBitCast = Builder.CreateBitCast(
		Store->getPointerOperand(),
		PredType->getPointerTo(Store->getPointerAddressSpace()));
		Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);

		Store->eraseFromParent();
		paulwalker-armUnsubmitted Not Done Reply Inline Actions Not sure why I've only just spotted this but does this do anything? Can there be any uses of a store? paulwalker-arm: Not sure why I've only just spotted this but does this do anything? Can there be any uses of a…
		if (IntrI->getNumUses() == 0)
		IntrI->eraseFromParent();
		if (BitCast->getNumUses() == 0)
		BitCast->eraseFromParent();

		return true;
		}

		// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
		// scalable loads as late as possible
		bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
		auto *F = I->getFunction();
		auto Attr = F->getFnAttribute(Attribute::VScaleRange);
		if (!Attr.isValid())
		paulwalker-armUnsubmitted Done Reply Inline Actions As with optimizePredicateStore. paulwalker-arm: As with optimizePredicateStore.
		return false;

		unsigned MinVScale, MaxVScale;
		std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
		// The transform needs to know the exact runtime length of scalable vectors
		if (MinVScale != MaxVScale \|\| MinVScale == 0)
		paulwalker-armUnsubmitted Done Reply Inline Actions I don't see any test to ensure the old bitcast is dead? When combined with my previous comment you'll likely want: if (IntrI->getNumUses() == 0) IntrI->eraseFromParent(); if (BitCast->getNumUses() == 0) BitCast->eraseFromParent(); paulwalker-arm: I don't see any test to ensure the old bitcast is dead? When combined with my previous comment…
		return false;

		auto *PredType =
		ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
		auto *FixedPredType =
		FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
		peterwaller-armUnsubmitted Done Reply Inline Actions Nit: s/loads/stores/ peterwaller-arm: Nit: s/loads/stores/

		// If we have a bitcast..
		auto *BitCast = dyn_cast<BitCastInst>(I);
		if (!BitCast \|\| BitCast->getType() != PredType)
		paulwalker-armUnsubmitted Done Reply Inline Actions I haven't looked but I imagine this'll have similar issues as raised for optimizePredicateVectorExtract. paulwalker-arm: I haven't looked but I imagine this'll have similar issues as raised for…
		return false;
		paulwalker-armUnsubmitted Done Reply Inline Actions Although possible, is this worth caring about? paulwalker-arm: Although possible, is this worth caring about?

		// ..whose operand is a vector_insert..
		auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
		if (!IntrI \|\|
		IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
		return false;

		// ..that is inserting into index zero of an undef vector..
		if (!isa<UndefValue>(IntrI->getOperand(0)) \|\|
		!cast<ConstantInt>(IntrI->getOperand(2))->isZero())
		return false;

		// ..where the value inserted comes from a load..
		auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
		if (!Load \|\| !Load->isSimple())
		return false;

		// ..that is loading a predicate vector sized worth of bits..
		if (Load->getType() != FixedPredType)
		return false;
		paulwalker-armUnsubmitted Done Reply Inline Actions As with the store case, we should only allow ordinary loads. paulwalker-arm: As with the store case, we should only allow ordinary loads.

		IRBuilder<> Builder(I->getContext());
		Builder.SetInsertPoint(Load);

		paulwalker-armUnsubmitted Done Reply Inline Actions Is the dyn_cast necessary? paulwalker-arm: Is the dyn_cast necessary?
		auto *PtrBitCast = Builder.CreateBitCast(
		Load->getPointerOperand(),
		PredType->getPointerTo(Load->getPointerAddressSpace()));
		auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
		paulwalker-armUnsubmitted Done Reply Inline Actions The insertion point needs to be where the original load is to ensure the new load maintains the same load/store ordering. paulwalker-arm: The insertion point needs to be where the original load is to ensure the new load maintains the…

		BitCast->replaceAllUsesWith(LoadPred);
		BitCast->eraseFromParent();
		if (IntrI->getNumUses() == 0)
		IntrI->eraseFromParent();
		if (Load->getNumUses() == 0)
		Load->eraseFromParent();

		return true;
		}

		/// Applies the per-instruction rewrites to every function in \p Functions.
		///
		/// Each store is offered to optimizePredicateStore and each bitcast to
		/// optimizePredicateLoad; all other opcodes are left untouched.
		///
		/// \param Functions the functions to visit.
		/// \returns true if any instruction was rewritten.
		bool SVEIntrinsicOpts::optimizeInstructions(
		    SmallSetVector<Function *, 4> &Functions) {
		  bool Changed = false;

		  for (auto *F : Functions) {
		    // getDomTree() yields a reference; keep a pointer to it so the tree is
		    // not copied, and dereference F because it is held as a Function *.
		    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();

		    // Traverse the DT with an rpo walk so we see defs before uses, allowing
		    // simplification to be done incrementally.
		    BasicBlock *Root = DT->getRoot();
		    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
		    for (auto *BB : RPOT) {
		      // The handlers may erase the instruction being visited, so step the
		      // iterator before the body runs.
		      for (Instruction &I : make_early_inc_range(*BB)) {
		        switch (I.getOpcode()) {
		        case Instruction::Store:
		          Changed |= optimizePredicateStore(&I);
		          break;
		        case Instruction::BitCast:
		          Changed |= optimizePredicateLoad(&I);
		          break;
		        default:
		          break;
		        }
		      }
		    }
		  }

		  return Changed;
		}

bool SVEIntrinsicOpts::optimizeFunctions(		bool SVEIntrinsicOpts::optimizeFunctions(
SmallSetVector<Function *, 4> &Functions) {		SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;		bool Changed = false;

Changed \|= optimizePTrueIntrinsicCalls(Functions);		Changed \|= optimizePTrueIntrinsicCalls(Functions);
		Changed \|= optimizeInstructions(Functions);
		junparserUnsubmitted Done Reply Inline Actions Do we need to handle vector_extract as well? junparser: Do we need to handle vector_extract as well?

return Changed;		return Changed;
}		}

bool SVEIntrinsicOpts::runOnModule(Module &M) {		bool SVEIntrinsicOpts::runOnModule(Module &M) {
bool Changed = false;		bool Changed = false;
SmallSetVector<Function *, 4> Functions;		SmallSetVector<Function *, 4> Functions;

// Check for SVE intrinsic declarations first so that we only iterate over		// Check for SVE intrinsic declarations first so that we only iterate over
// relevant functions. Where an appropriate declaration is found, store the		// relevant functions. Where an appropriate declaration is found, store the
// function(s) where it is used so we can target these only.		// function(s) where it is used so we can target these only.
for (auto &F : M.getFunctionList()) {		for (auto &F : M.getFunctionList()) {
if (!F.isDeclaration())		if (!F.isDeclaration())
continue;		continue;

switch (F.getIntrinsicID()) {		switch (F.getIntrinsicID()) {
		case Intrinsic::experimental_vector_extract:
		case Intrinsic::experimental_vector_insert:
case Intrinsic::aarch64_sve_ptrue:		case Intrinsic::aarch64_sve_ptrue:
for (User *U : F.users())		for (User *U : F.users())
Functions.insert(cast<Instruction>(U)->getFunction());		Functions.insert(cast<Instruction>(U)->getFunction());
break;		break;
default:		default:
break;		break;
}		}
}		}

if (!Functions.empty())		if (!Functions.empty())
Changed \|= optimizeFunctions(Functions);		Changed \|= optimizeFunctions(Functions);

return Changed;		return Changed;
}		}

llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll

This file was added.

				; RUN: opt -S -aarch64-sve-intrinsic-opts < %s \| FileCheck %s

				target triple = "aarch64-unknown-linux-gnu"

				define void @pred_store_v2i8(<vscale x 16 x i1> %pred, <2 x i8>* %addr) #0 {
				; CHECK-LABEL: @pred_store_v2i8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: ret void
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
				store <2 x i8> %extract, <2 x i8>* %addr, align 4
				ret void
				}

				define void @pred_store_v4i8(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
				; CHECK-LABEL: @pred_store_v4i8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: ret void
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
				store <4 x i8> %extract, <4 x i8>* %addr, align 4
				ret void
				}

				define void @pred_store_v8i8(<vscale x 16 x i1> %pred, <8 x i8>* %addr) #2 {
				; CHECK-LABEL: @pred_store_v8i8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: ret void
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
				store <8 x i8> %extract, <8 x i8>* %addr, align 4
				ret void
				}


				; Check that too small of a vscale prevents optimization
				; Attribute #0 is vscale_range(1,1), for which the pass only rewrites stores
				; of <2 x i8> (MinVScale * 2 bytes). This <4 x i8> store is wider than the
				; predicate, so the vector extract must survive.
				define void @pred_store_neg1(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #0 {
				; CHECK-LABEL: @pred_store_neg1(
				; CHECK: call <4 x i8> @llvm.experimental.vector.extract
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
				store <4 x i8> %extract, <4 x i8>* %addr, align 4
				ret void
				}

				; Check that too large of a vscale prevents optimization
				; Attribute #2 is vscale_range(4,4), for which the pass only rewrites stores
				; of <8 x i8> (MinVScale * 2 bytes). This <4 x i8> store covers only part of
				; the predicate, so the vector extract must survive.
				define void @pred_store_neg2(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #2 {
				; CHECK-LABEL: @pred_store_neg2(
				; CHECK: call <4 x i8> @llvm.experimental.vector.extract
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
				store <4 x i8> %extract, <4 x i8>* %addr, align 4
				ret void
				}

				; Check that a non-zero index prevents optimization
				; The store type matches attribute #1 (vscale_range(2,2) -> <4 x i8>), but the
				; extract starts at index 4 and the pass only accepts an extract from index 0.
				define void @pred_store_neg3(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
				; CHECK-LABEL: @pred_store_neg3(
				; CHECK: call <4 x i8> @llvm.experimental.vector.extract
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 4)
				store <4 x i8> %extract, <4 x i8>* %addr, align 4
				ret void
				}

				; Check that differing vscale min/max prevents optimization
				; Attribute #3 is vscale_range(2,4). The pass requires min == max so that the
				; exact runtime length of scalable vectors is known, so nothing is rewritten.
				define void @pred_store_neg4(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #3 {
				; CHECK-LABEL: @pred_store_neg4(
				; CHECK: call <4 x i8> @llvm.experimental.vector.extract
				%bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
				%extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
				store <4 x i8> %extract, <4 x i8>* %addr, align 4
				ret void
				}

				declare <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8>, i64)
				declare <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8>, i64)
				declare <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8>, i64)

				attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
				attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
				attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
				attributes #3 = { "target-features"="+sve" vscale_range(2,4) }

llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll

This file was added.

				; RUN: opt -S -aarch64-sve-intrinsic-opts < %s \| FileCheck %s

				target triple = "aarch64-unknown-linux-gnu"

				define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
				; CHECK-LABEL: @pred_load_v2i8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: ret <vscale x 16 x i1> [[TMP2]]
				%load = load <2 x i8>, <2 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
				paulwalker-armUnsubmitted Done Reply Inline Actions Given the new load placement issue I mentioned above it's worth having the load in a separate basic block in order to validate that fix. paulwalker-arm: Given the new load placement issue I mentioned above it's worth having the load in a separate…
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
				; CHECK-LABEL: @pred_load_v4i8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: ret <vscale x 16 x i1> [[TMP2]]
				%load = load <4 x i8>, <4 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
				; CHECK-LABEL: @pred_load_v8i8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: ret <vscale x 16 x i1> [[TMP2]]
				%load = load <8 x i8>, <8 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				; Ensure the insertion point is at the load
				define <vscale x 16 x i1> @pred_load_insertion_point(<2 x i8>* %addr) #0 {
				; CHECK-LABEL: @pred_load_insertion_point(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
				; CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
				; CHECK-NEXT: br label %bb1
				; CHECK: bb1:
				; CHECK-NEXT: ret <vscale x 16 x i1> [[TMP2]]
				entry:
				%load = load <2 x i8>, <2 x i8>* %addr, align 4
				br label %bb1

				bb1:
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				; Check that too small of a vscale prevents optimization
				; Attribute #0 is vscale_range(1,1), for which the pass only rewrites loads
				; of <2 x i8> (MinVScale * 2 bytes). This <4 x i8> load is wider than the
				; predicate, so the vector insert must survive.
				define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
				; CHECK-LABEL: @pred_load_neg1(
				; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
				%load = load <4 x i8>, <4 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}
				paulwalker-armUnsubmitted Not Done Reply Inline Actions Not really relevant for this patch but it occurs to me that this IR is provably bogus if we wanted to add the necessary hooks into the verifier. paulwalker-arm: Not really relevant for this patch but it occurs to me that this IR is provably bogus if we…

				; Check that too large of a vscale prevents optimization
				; Attribute #2 is vscale_range(4,4), for which the pass only rewrites loads
				; of <8 x i8> (MinVScale * 2 bytes). This <4 x i8> load covers only part of
				; the predicate, so the vector insert must survive.
				define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
				; CHECK-LABEL: @pred_load_neg2(
				; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
				%load = load <4 x i8>, <4 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				; Check that a non-zero index prevents optimization
				; The load type matches attribute #1 (vscale_range(2,2) -> <4 x i8>), but the
				; insert lands at index 4 and the pass only accepts an insert at index 0.
				define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
				; CHECK-LABEL: @pred_load_neg3(
				; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
				%load = load <4 x i8>, <4 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				; Check that differing vscale min/max prevents optimization
				; Attribute #3 is vscale_range(2,4). The pass requires min == max so that the
				; exact runtime length of scalable vectors is known, so nothing is rewritten.
				define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr) #3 {
				; CHECK-LABEL: @pred_load_neg4(
				; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
				%load = load <4 x i8>, <4 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				; Check that insertion into a non-undef vector prevents optimization
				; The load type and index match attribute #1 (vscale_range(2,2) -> <4 x i8>),
				; but the insert targets %passthru and the pass only accepts an undef base
				; vector.
				define <vscale x 16 x i1> @pred_load_neg5(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
				; CHECK-LABEL: @pred_load_neg5(
				; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
				%load = load <4 x i8>, <4 x i8>* %addr, align 4
				%insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
				%ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
				ret <vscale x 16 x i1> %ret
				}

				declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
				declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
				declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)

				attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
				attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
				attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
				attributes #3 = { "target-features"="+sve" vscale_range(2,4) }

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][SVE] Combine bitcasts to predicate types with vector inserts of loads
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 364137

llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp

llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll

llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AArch64][SVE] Combine bitcasts to predicate types with vector inserts of loadsClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 364137

llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp

llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll

llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll

[AArch64][SVE] Combine bitcasts to predicate types with vector inserts of loads
ClosedPublic