This is an archive of the discontinued LLVM Phabricator instance.

[CodeGen] Complex deinterleaving vector splitting
AbandonedPublic

Authored by NickGuy on Jul 4 2022, 2:58 AM.

Download Raw Diff

Details

Reviewers

dmgreen
samtebbs

Summary

Adds support for splitting complex vectors so that each part fits within a single vector register.

Diff Detail

Repository: rG LLVM Github Monorepo

Unit Tests: Failed

	Time	Test
	250 ms	x64 debian > BOLT.runtime/X86::user-func-reorder.c
	100 ms	x64 debian > LLVM.CodeGen/ARM/ComplexArithmetic::complex-arithmetic-f16-add.ll
	3,780 ms	x64 debian > LLVM.CodeGen/ARM/ComplexArithmetic::complex-arithmetic-f16-mul.ll
	90 ms	x64 debian > LLVM.CodeGen/ARM/ComplexArithmetic::complex-arithmetic-f32-add.ll
	3,910 ms	x64 debian > LLVM.CodeGen/ARM/ComplexArithmetic::complex-arithmetic-f32-mul.ll
		View Full Test Results (6 Failed)

Event Timeline

NickGuy created this revision. · Jul 4 2022, 2:58 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 4 2022, 2:58 AM

Herald added a subscriber: hiraditya. · View Herald Transcript

NickGuy requested review of this revision. · Jul 4 2022, 2:58 AM

NickGuy added a parent revision: D114174: [ARM][CodeGen] Add support for complex deinterleaving.

NickGuy mentioned this in D114174: [ARM][CodeGen] Add support for complex deinterleaving. · Jul 4 2022, 3:42 AM

Looks like I messed up my git-fu somewhere in preparing these patches. There are a few cleanup changes in this patch that should've been in a previous one. I'll try to clean that up when addressing other comments.

Harbormaster completed remote builds in B173526: Diff 442051. · Jul 4 2022, 4:16 AM

Matt added a subscriber: Matt. · Jul 4 2022, 10:26 AM

Can we do the splitting on the Target side of the boundary? So it gets asked to create a "v8f32 vcmul" from two values and generates a series of "extract-subvector; vcmul; insert-subvector" each of size v4f32. I don't think the pass/graph would actually need to change, it would just be up to the backend to generate something equivalent. Then the graph is simpler, and all the added shuffles would optimize away.

See lowerInterleavedLoad (https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/ARM/ARMISelLowering.cpp#L21400) and lowerInterleavedStore (https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/ARM/ARMISelLowering.cpp#L21548) for examples of that.

Vector splitting has since been made the responsibility of the target, so the approach in this patch is no longer relevant.

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

ComplexDeinterleavingPass.cpp

229 lines

Diff 442051

llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp

Show All 36 Lines
//		//
// Replacement:		// Replacement:
// This step performs the necessary input wrangling (chasing values through		// This step performs the necessary input wrangling (chasing values through
// accumulators, shuffles, and other composite nodes) in order for the target to		// accumulators, shuffles, and other composite nodes) in order for the target to
// know what to generate. While some additional checks are performed at this		// know what to generate. While some additional checks are performed at this
// step, it is expected to finish successfully, while any errors should be		// step, it is expected to finish successfully, while any errors should be
// caught via asserts.		// caught via asserts.
//		//
		// Vector Splitting:
		// Vector Splitting is only employed if the resulting vectors would exceed the
		// width of a single vector register, and is mostly performed between the
		// Identification and Replacement steps. Each composite node is cloned and
		// annotated with a split index; the original being set to 0, and the clone
		// being set to 1. This index identifies whether the given node is operating on
		// the lower or higher portion of the original vector, and is used to restrict
		// CompositeNode lookups to the same side of the split. The vector splitting is
		// responsible for intercepting the loads with shuffles to get only the relevant
		// data for that split (e.g. Elements <0, 1, 2, 3> from an 8x vector for the
		// lower split), as well as rejoining the 2 split graphs into one at the end,
		// through the use of a concatenating shuffle.
		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ComplexDeinterleavingPass.h"		#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
#include "llvm/ADT/Statistic.h"		#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetLibraryInfo.h"		#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"		#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetLowering.h"		#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"		#include "llvm/CodeGen/TargetPassConfig.h"
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	struct ComplexDeinterleavingCompositeNode {

ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op)		ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op)
: Operation(Op) {}		: Operation(Op) {}

private:		private:
friend class ComplexDeinterleavingGraph;		friend class ComplexDeinterleavingGraph;

public:		public:
		void SetSplit(unsigned Idx) {
		HasSplit = true;
		SplitIdx = Idx;
		}

SmallVector<Value *> getOperands() {		SmallVector<Value *> getOperands() {
SmallVector<Value *> Ops;		SmallVector<Value *> Ops;

for (const auto &Inst : ContainedInstructions) {		for (const auto &Inst : ContainedInstructions) {
for (Value *V : Inst->operands()) {		for (Value *V : Inst->operands()) {
auto *I = dyn_cast<Instruction>(V);		auto *I = dyn_cast<Instruction>(V);
if (!I \|\| !contains(I)) {		if (!I \|\| !contains(I)) {
Ops.push_back(V);		Ops.push_back(V);
continue;		continue;
}		}
}		}
}		}
return Ops;		return Ops;
}		}

Value *getOperand(unsigned Idx) { return getOperands()[Idx]; }		Value *getOperand(unsigned Idx) { return getOperands()[Idx]; }

unsigned getNumOperands() { return getOperands().size(); }		unsigned getNumOperands() { return getOperands().size(); }

SmallVector<Instruction *> ContainedInstructions;		SmallVector<Instruction *> ContainedInstructions;
Value *OutputNode = nullptr;		Value *OutputNode = nullptr;
Value *OriginalInput0 = nullptr;		Value *OriginalInput0 = nullptr;
Value *OriginalInput1 = nullptr;		Value *OriginalInput1 = nullptr;
Value *ReplacementNode = nullptr;		Value *ReplacementNode = nullptr;
bool IsTopLevel = false;
ComplexDeinterleavingOperation Operation;		ComplexDeinterleavingOperation Operation;

bool UsesNegation = false;		bool UsesNegation = false;
unsigned Rotation = 0;		unsigned Rotation = 0;
Value *Input0 = nullptr;		Value *Input0 = nullptr;
Value *Input1 = nullptr;		Value *Input1 = nullptr;
Value *Accumulator = nullptr;		Value *Accumulator = nullptr;
Value *Accumulatee = nullptr;		Value *Accumulatee = nullptr;

		bool HasSplit = false;
		int SplitIdx = -1;

void addInstruction(Instruction *I) { ContainedInstructions.push_back(I); }		void addInstruction(Instruction *I) { ContainedInstructions.push_back(I); }
bool contains(Instruction *I) {		bool contains(Instruction *I) {
if (I == ReplacementNode)		if (I == ReplacementNode)
return true;		return true;

return llvm::find(ContainedInstructions, I) != ContainedInstructions.end();		return llvm::find(ContainedInstructions, I) != ContainedInstructions.end();
}		}
};		};

class ComplexDeinterleavingGraph {		class ComplexDeinterleavingGraph {
private:		public:
using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;		using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;

		private:
SmallVector<Instruction *> Instructions;		SmallVector<Instruction *> Instructions;
SmallVector<NodePtr> CompositeNodes;		SmallVector<NodePtr> CompositeNodes;

llvm::TargetTransformInfo::TargetCostKind CostKind =		llvm::TargetTransformInfo::TargetCostKind CostKind =
llvm::TargetTransformInfo::TCK_Latency;		llvm::TargetTransformInfo::TCK_Latency;

InstructionCost CostOfIntrinsics;		InstructionCost CostOfIntrinsics;
		bool NeedsSplit = false;

		std::map<Value , Value *> ShuffleMapping;
		Value *splitLoadIfNecessary(ComplexDeinterleavingGraph::NodePtr Node,
		Value *V);

/// Determines the operating component of the given Value.		/// Determines the operating component of the given Value.
/// This is achieved by looking at the operating component of the Value's		/// This is achieved by looking at the operating component of the Value's
/// operands and, based on the instruction, evaluates what the resulting		/// operands and, based on the instruction, evaluates what the resulting
/// component would be.		/// component would be.
OperatingComponent getOperatingComponentOfValue(Value *V) {		OperatingComponent getOperatingComponentOfValue(Value *V) {
Instruction *I = dyn_cast_or_null<Instruction>(V);		Instruction *I = dyn_cast_or_null<Instruction>(V);
if (!I)		if (!I)
▲ Show 20 Lines • Show All 129 Lines • ▼ Show 20 Lines	Value followUseChain(Value V) {
if (V->hasOneUser())		if (V->hasOneUser())
return followUseChain(*V->user_begin());		return followUseChain(*V->user_begin());

// TODO handle multiple users, but how?		// TODO handle multiple users, but how?

return V;		return V;
}		}

Value getFinalInputReplacement(Instruction I) {		Value getFinalInputReplacement(Instruction I, int SplitIdx = -1) {
for (Value *V : I->operands()) {		for (Value *V : I->operands()) {
auto *Op = dyn_cast<Instruction>(V);		auto *Op = dyn_cast<Instruction>(V);
while (Op && shouldIgnoreValue(Op))		while (Op && shouldIgnoreValue(Op))
Op = dyn_cast<Instruction>(Op->getOperand(0));		Op = dyn_cast<Instruction>(Op->getOperand(0));
if (Op == nullptr)		if (Op == nullptr)
continue;		continue;

auto CN = getContainingComposite(Op);		auto CN = getContainingComposite(Op, SplitIdx);
if (CN == nullptr \|\| CN->ReplacementNode == nullptr)		if (CN == nullptr \|\| CN->ReplacementNode == nullptr)
continue;		continue;
return followUseChain(CN->ReplacementNode);		return followUseChain(CN->ReplacementNode);
}		}

return nullptr;		return nullptr;
}		}

Value getReplacement(Instruction I) {		Value getReplacement(Instruction I) {
if (!I)		if (!I)
return nullptr;		return nullptr;
auto CN = getContainingComposite(I);		auto CN = getContainingComposite(I);
if (CN == nullptr \|\| CN->ReplacementNode == nullptr)		if (CN == nullptr \|\| CN->ReplacementNode == nullptr)
return I;		return I;
return CN->ReplacementNode;		return CN->ReplacementNode;
}		}

std::shared_ptr<ComplexDeinterleavingCompositeNode>		std::shared_ptr<ComplexDeinterleavingCompositeNode>
prepareCompositeNode(ComplexDeinterleavingOperation Operation) {		prepareCompositeNode(ComplexDeinterleavingOperation Operation) {
return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation);		return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation);
}		}

		std::shared_ptr<ComplexDeinterleavingCompositeNode>
		cloneCompositeNode(NodePtr OtherNode) {
		auto NewNode = prepareCompositeNode(OtherNode->Operation);

		NewNode->ContainedInstructions.append(OtherNode->ContainedInstructions);
		NewNode->OutputNode = OtherNode->OutputNode;
		NewNode->OriginalInput0 = OtherNode->OriginalInput0;
		NewNode->OriginalInput1 = OtherNode->OriginalInput1;

		NewNode->UsesNegation = OtherNode->UsesNegation;
		NewNode->Rotation = OtherNode->Rotation;
		NewNode->Accumulator = OtherNode->Accumulator;
		NewNode->Accumulatee = OtherNode->Accumulatee;

		return NewNode;
		}

void		void
submitCompositeNode(std::shared_ptr<ComplexDeinterleavingCompositeNode> CN) {		submitCompositeNode(std::shared_ptr<ComplexDeinterleavingCompositeNode> CN) {
CompositeNodes.push_back(CN);		CompositeNodes.push_back(CN);
}		}

bool containsNode(Instruction *I) {		bool containsNode(Instruction *I) {
return llvm::find(Instructions, I) != Instructions.end();		return llvm::find(Instructions, I) != Instructions.end();
}		}
Show All 37 Lines	if (A->hasOneUser() && B->hasOneUser()) {

if (AUser == BUser)		if (AUser == BUser)
return true;		return true;
}		}

return haveSharedUses(A, B);		return haveSharedUses(A, B);
}		}

NodePtr getContainingComposite(Instruction *I) {		NodePtr getContainingComposite(Instruction *I, int SplitIdx = -1) {
if (I == nullptr)		if (I == nullptr)
return nullptr;		return nullptr;
for (const auto &CN : CompositeNodes) {		for (const auto &CN : CompositeNodes) {
		if (SplitIdx > -1 && CN->SplitIdx != SplitIdx)
		continue;

if (CN->contains(I))		if (CN->contains(I))
return CN;		return CN;
if (CN->ReplacementNode == I)		if (CN->ReplacementNode == I)
return CN;		return CN;
}		}
return nullptr;		return nullptr;
}		}

bool identifyCMulPartial(Instruction I, const TargetLowering TL,		bool identifyCMulPartial(Instruction I, const TargetLowering TL,
bool &ContinueIdentification);		bool &ContinueIdentification);
bool identifyOrphanedCMulPartial(Instruction I, Instruction J,		bool identifyOrphanedCMulPartial(Instruction I, Instruction J,
const TargetLowering *TL,		const TargetLowering *TL,
bool &ContinueIdentification);		bool &ContinueIdentification);
bool identifyCAdd(Instruction I, Instruction J, const TargetLowering *TL,		bool identifyCAdd(Instruction I, Instruction J, const TargetLowering *TL,
bool &ContinueIdentification);		bool &ContinueIdentification);

public:		public:
/// Step through the use-def chains to find all instruction nodes converging		/// Step through the use-def chains to find all instruction nodes converging
/// on \p I.		/// on \p I.
void discoverNodes(BasicBlock B, Instruction I);		void discoverNodes(BasicBlock B, Instruction I);
/// Iterate over the nodes, reducing them to complex nodes where possible.		/// Iterate over the nodes, reducing them to complex nodes where possible.
/// Returns false if the deinterleaving operation should be cancelled for the		/// Returns false if the deinterleaving operation should be cancelled for the
/// current graph.		/// current graph.
bool identifyNodes(const TargetLowering *TL);		bool identifyNodes(const TargetLowering *TL);

		/// If necessary, splits the nodes so that the operations can fit within a
		/// single vector
		void splitNodes(const TargetLowering *TL);

/// Perform the actual replacement of the underlying instruction graph.		/// Perform the actual replacement of the underlying instruction graph.
/// Returns false if the deinterleaving operation should be cancelled for the		/// Returns false if the deinterleaving operation should be cancelled for the
/// current graph.		/// current graph.
bool replaceNodes(const TargetLowering *TL);		bool replaceNodes(const TargetLowering *TL);
void getDeadRoots(SmallVector<Instruction *> &DeadInstRoots);		void getDeadRoots(SmallVector<Instruction *> &DeadInstRoots);
};		};

class ComplexDeinterleaving {		class ComplexDeinterleaving {
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines	if ((match(I, m_FAdd(m_FMul(m_Value(), m_Value()),
auto *VTy = dyn_cast<FixedVectorType>(I->getType());		auto *VTy = dyn_cast<FixedVectorType>(I->getType());
if (!VTy)		if (!VTy)
return false;		return false;

auto *NewVTy =		auto *NewVTy =
FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);		FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);

if (!TL->isComplexDeinterleavingOperationSupported(		if (!TL->isComplexDeinterleavingOperationSupported(
ComplexDeinterleavingOperation::CMulPartial, NewVTy))		ComplexDeinterleavingOperation::CMulPartial, NewVTy)) {
		if (!TL->isComplexDeinterleavingOperationSupported(
		ComplexDeinterleavingOperation::CMulPartial, VTy))
return false;		return false;

LLVM_DEBUG(dbgs() << "Composite node built up from "; N->dump());		NeedsSplit = true;
auto CN =		}
prepareCompositeNode(llvm::ComplexDeinterleavingOperation::CMulPartial);
		auto CN = prepareCompositeNode(
		llvm::ComplexDeinterleavingOperation::CMulPartial);

auto *Op0 = cast<Instruction>(I->getOperand(0));		auto *Op0 = cast<Instruction>(I->getOperand(0));
auto *Op1 = cast<Instruction>(I->getOperand(1));		auto *Op1 = cast<Instruction>(I->getOperand(1));

CN->addInstruction(I);		CN->addInstruction(I);
CN->addInstruction(Op0);		CN->addInstruction(Op0);
CN->addInstruction(Op1);		CN->addInstruction(Op1);

Show All 22 Lines	for (Value *V : Op1->operands()) {
}		}
}		}

if (!ContainsNeg) {		if (!ContainsNeg) {
auto &Use = (*I->use_begin());		auto &Use = (*I->use_begin());
if (I->getOpcode() == Instruction::FSub) {		if (I->getOpcode() == Instruction::FSub) {
if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 0) {		if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 0) {
LLVM_DEBUG(dbgs()		LLVM_DEBUG(dbgs()
<< "First converging shuffle operand should be an FSub"		<< "First converging shuffle operand should be an FSub.\n");
<< ".\n");
ContinueIdentification = false;
return false;		return false;
}		}
} else if (I->getOpcode() == Instruction::FAdd) {		} else if (I->getOpcode() == Instruction::FAdd) {
if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 1) {		if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 1) {
LLVM_DEBUG(dbgs()		LLVM_DEBUG(dbgs()
<< "Second converging shuffle operand should be an FAdd"		<< "Second converging shuffle operand should be an FAdd.\n");
<< ".\n");
return false;		return false;
}		}
}		}
}		}

auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()),
m_Shuffle(m_Value(), m_Value()));
CN->IsTopLevel = match(CN->OriginalInput0, Pattern) &&
match(CN->OriginalInput1, Pattern);
CN->UsesNegation = ContainsNeg;		CN->UsesNegation = ContainsNeg;
CN->OutputNode = I;		CN->OutputNode = I;

CN->Rotation = (I->getOpcode() == Instruction::FAdd) * 90;		CN->Rotation = (I->getOpcode() == Instruction::FAdd) * 90;

if (I->getOpcode() == Instruction::FSub) {		if (I->getOpcode() == Instruction::FSub) {
auto *SubOp0 = cast<Instruction>(I->getOperand(0));		auto *SubOp0 = cast<Instruction>(I->getOperand(0));
auto SubOp0C0 = getOperatingComponentOfValue(SubOp0->getOperand(0));		auto SubOp0C0 = getOperatingComponentOfValue(SubOp0->getOperand(0));
▲ Show 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	if (SharedOp) {
"operand index is greater than 1.\n");		"operand index is greater than 1.\n");
return false;		return false;
}		}
CN->addInstruction(SharedOp);		CN->addInstruction(SharedOp);
CN->UsesNegation = true;		CN->UsesNegation = true;
}		}
}		}

auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()),
m_Shuffle(m_Value(), m_Value()));
CN->IsTopLevel = match(CN->OriginalInput0, Pattern) &&
match(CN->OriginalInput1, Pattern);
CN->OutputNode = J;		CN->OutputNode = J;
submitCompositeNode(CN);		submitCompositeNode(CN);
return true;		return true;
}		}
ContinueIdentification = true;		ContinueIdentification = true;
return false;		return false;
}		}

▲ Show 20 Lines • Show All 178 Lines • ▼ Show 20 Lines	if (haveSharedUses(CN->OutputNode, PrevCN->OutputNode)) {
CN->Accumulator = PrevCN->OutputNode;		CN->Accumulator = PrevCN->OutputNode;
PrevCN->Accumulatee = CN->OutputNode;		PrevCN->Accumulatee = CN->OutputNode;
}		}
}		}

return true;		return true;
}		}

		void ComplexDeinterleavingGraph::splitNodes(const TargetLowering *TL) {
		unsigned Cap = CompositeNodes.size();
		for (unsigned Idx = 0; Idx < Cap; Idx++) {
		auto Item = CompositeNodes[Idx];
		auto NewNode = cloneCompositeNode(Item);

		Item->SetSplit(0);
		NewNode->SetSplit(1);

		submitCompositeNode(NewNode);
		}
		}

		Value *ComplexDeinterleavingGraph::splitLoadIfNecessary(
		ComplexDeinterleavingGraph::NodePtr Node, Value *V) {
		if (!V)
		return V;

		auto *I = dyn_cast<Instruction>(V);
		if (!I)
		return V;

		if (!isa<LoadInst>(I) \|\| !Node->HasSplit)
		return I;

		IRBuilder<> B(I);

		auto *Ty = I->getType();
		auto *VTy = dyn_cast<FixedVectorType>(Ty);
		if (!VTy)
		return I;

		unsigned Size = VTy->getNumElements() / 2;
		SmallVector<int> Mask = createArrayWithStep(Size, 1, Size * Node->SplitIdx);

		Value *Shuffle = nullptr;

		auto It = ShuffleMapping.find(I);

		if (It != ShuffleMapping.end()) {
		Shuffle = (*It).second[Node->SplitIdx];
		}

		if (Shuffle == nullptr) {
		Shuffle = B.CreateShuffleVector(I, Mask);
		cast<Instruction>(Shuffle)->moveAfter(I);
		LLVM_DEBUG(dbgs() << "Creating new shuffle:"; Shuffle->dump());

		if (It == ShuffleMapping.end()) {
		auto P = ShuffleMapping.emplace(I, new Value *[2]);
		It = P.first;
		(*It).second[0] = nullptr;
		(*It).second[1] = nullptr;
		}
		(*It).second[Node->SplitIdx] = Shuffle;
		} else {

		LLVM_DEBUG(dbgs() << "Reusing shuffle:"; Shuffle->dump());
		}

		return Shuffle;
		}

bool ComplexDeinterleavingGraph::replaceNodes(const TargetLowering *TL) {		bool ComplexDeinterleavingGraph::replaceNodes(const TargetLowering *TL) {
if (CompositeNodes.empty())		if (CompositeNodes.empty())
return false;		return false;

		if (NeedsSplit)
		splitNodes(TL);

unsigned GeneratedIntrinsics = 0;		unsigned GeneratedIntrinsics = 0;
auto *ConvergingI = Instructions[0];		auto *ConvergingI = Instructions[0];

auto TTI = TL->getTargetMachine().getTargetTransformInfo(		auto TTI = TL->getTargetMachine().getTargetTransformInfo(
*ConvergingI->getFunction());		*ConvergingI->getFunction());
for (const auto &CN : CompositeNodes) {		for (const auto &CN : CompositeNodes) {
auto *N = cast<Instruction>(CN->OutputNode);		auto *N = cast<Instruction>(CN->OutputNode);

// Wrangle the inputs		// Wrangle the inputs

/// If the given value is part of a CompositeNode, and said node is part of		/// If the given value is part of a CompositeNode, and said node is part of
/// an accumulator chain, return the accumulator. Otherwise, returns the		/// an accumulator chain, return the accumulator. Otherwise, returns the
/// "best fit" value (the ReplacementNode of a containing CompositeNode, or		/// "best fit" value (the ReplacementNode of a containing CompositeNode, or
/// the value itself)		/// the value itself)
auto FollowAccumulatorIfNecessary = [&](Value V) -> Value {		auto FollowAccumulatorIfNecessary = [&](NodePtr Node, Value V) -> Value {
		LLVM_DEBUG(dbgs() << "FollowAccumulatorIfNecessary"
		<< ".\n");
auto *I = dyn_cast<Instruction>(V);		auto *I = dyn_cast<Instruction>(V);
if (!I)		if (!I)
return V;		return V;

auto CN = getContainingComposite(I);		auto CN = getContainingComposite(I, Node->SplitIdx);
if (!CN)		if (!CN)
return I;		return I;

if (CN->Accumulatee)		if (CN->Accumulatee)
CN = getContainingComposite(cast<Instruction>(CN->Accumulatee));		CN = getContainingComposite(cast<Instruction>(CN->Accumulatee),
		CN->SplitIdx);

return CN->ReplacementNode;		return CN->ReplacementNode;
};		};

/// Given a value and an operand index, get said operand and return it.		/// Given a value and an operand index, get said operand and return it.
/// If the discovered operand is part of a composite node, return the		/// If the discovered operand is part of a composite node, return the
/// replacement instead.		/// replacement instead.
auto GetInputFromOriginalInput = [&](Value *OriginalInput,		auto GetInputFromOriginalInput = [&](NodePtr Node, Value *OriginalInput,
unsigned OpIdx) -> Value * {		unsigned OpIdx) -> Value * {
auto *OriginalI = cast<Instruction>(OriginalInput);		auto *OriginalI = cast<Instruction>(OriginalInput);
if (OriginalI->getOpcode() == Instruction::FNeg)		if (OriginalI->getOpcode() == Instruction::FNeg)
OpIdx = 0;		OpIdx = 0;

auto *Op = OriginalI->getOperand(OpIdx);		auto *Op = OriginalI->getOperand(OpIdx);
if (auto *SVI = dyn_cast<ShuffleVectorInst>(Op))		if (auto *SVI = dyn_cast<ShuffleVectorInst>(Op))
Op = SVI->getOperand(0);		Op = SVI->getOperand(0);

if (!Op)		if (!Op)
return nullptr;		return nullptr;

if (auto *I = dyn_cast<Instruction>(Op)) {		if (auto *I = dyn_cast<Instruction>(Op)) {
if (auto Containing = getContainingComposite(I)) {		if (auto Containing = getContainingComposite(I, Node->SplitIdx)) {
if (Containing->ReplacementNode)		if (Containing->ReplacementNode)
return Containing->ReplacementNode;		return Containing->ReplacementNode;
}		}
}		}
return Op;		return Op;
};		};

if (CN->Operation == llvm::ComplexDeinterleavingOperation::CAdd) {		if (CN->Operation == llvm::ComplexDeinterleavingOperation::CAdd) {
Value *Sub = nullptr;		Value *Sub = nullptr;
if (auto *Op0 = dyn_cast<Instruction>(CN->OriginalInput0)) {		if (auto *Op0 = dyn_cast<Instruction>(CN->OriginalInput0)) {
if (Op0->getOpcode() == Instruction::FSub)		if (Op0->getOpcode() == Instruction::FSub)
Sub = Op0;		Sub = Op0;
}		}
if (!Sub) {		if (!Sub) {
if (auto *Op1 = dyn_cast<Instruction>(CN->OriginalInput1)) {		if (auto *Op1 = dyn_cast<Instruction>(CN->OriginalInput1)) {
if (Op1->getOpcode() == Instruction::FSub)		if (Op1->getOpcode() == Instruction::FSub)
Sub = Op1;		Sub = Op1;
}		}
}		}

if (!Sub)		if (!Sub)
return false;		return false;

CN->Input0 =		CN->Input0 = FollowAccumulatorIfNecessary(
FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 0));		CN, GetInputFromOriginalInput(CN, Sub, 0));
CN->Input1 =		CN->Input1 = FollowAccumulatorIfNecessary(
FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 1));		CN, GetInputFromOriginalInput(CN, Sub, 1));
} else {		} else {
CN->Input0 = FollowAccumulatorIfNecessary(		CN->Input0 = FollowAccumulatorIfNecessary(
GetInputFromOriginalInput(CN->OriginalInput0, 0));		CN, GetInputFromOriginalInput(CN, CN->OriginalInput0, 0));
CN->Input1 = FollowAccumulatorIfNecessary(		CN->Input1 = FollowAccumulatorIfNecessary(
GetInputFromOriginalInput(CN->OriginalInput1, 0));		CN, GetInputFromOriginalInput(CN, CN->OriginalInput1, 0));

if (CN->OriginalInput0 != CN->OriginalInput1 && CN->Input0 == CN->Input1)		if (CN->OriginalInput0 != CN->OriginalInput1 && CN->Input0 == CN->Input1)
CN->Input1 = FollowAccumulatorIfNecessary(		CN->Input1 = FollowAccumulatorIfNecessary(
GetInputFromOriginalInput(CN->OriginalInput1, 1));		CN, GetInputFromOriginalInput(CN, CN->OriginalInput1, 1));
}		}

if (CN->Input0 == nullptr \|\| CN->Input1 == nullptr)		if (CN->Input0 == nullptr \|\| CN->Input1 == nullptr)
continue;		continue;

		LLVM_DEBUG(dbgs() << "Splitting loads if necessary"
		<< ".\n");
		CN->Input0 = splitLoadIfNecessary(CN, CN->Input0);
		CN->Input1 = splitLoadIfNecessary(CN, CN->Input1);

if (CN->Accumulator) {		if (CN->Accumulator) {
if (auto Node =		if (auto Node = getContainingComposite(cast<Instruction>(CN->Accumulator),
getContainingComposite(cast<Instruction>(CN->Accumulator)))		CN->SplitIdx))
CN->Accumulator = cast<Instruction>(Node->ReplacementNode);		CN->Accumulator = cast<Instruction>(Node->ReplacementNode);
}		}

if (CN->Operation == llvm::ComplexDeinterleavingOperation::CMulPartial &&		if (CN->Operation == llvm::ComplexDeinterleavingOperation::CMulPartial &&
CN->Accumulator) {		CN->Accumulator) {
if (auto Node =		if (auto Node = getContainingComposite(cast<Instruction>(CN->Accumulator),
getContainingComposite(cast<Instruction>(CN->Accumulator))) {		CN->SplitIdx)) {
bool Valid90 = (Node->Rotation == 0 && CN->Rotation == 90) \|\|		bool Valid90 = (Node->Rotation == 0 && CN->Rotation == 90) \|\|
(Node->Rotation == 90 && CN->Rotation == 0);		(Node->Rotation == 90 && CN->Rotation == 0);
bool Valid270 = (Node->Rotation == 180 && CN->Rotation == 270) \|\|		bool Valid270 = (Node->Rotation == 180 && CN->Rotation == 270) \|\|
(Node->Rotation == 270 && CN->Rotation == 180);		(Node->Rotation == 270 && CN->Rotation == 180);
if (!Valid90 && !Valid270) {		if (!Valid90 && !Valid270) {
LLVM_DEBUG(dbgs() << "Invalid rotation pairs.\n");		LLVM_DEBUG(dbgs() << "Invalid rotation pairs.\n");
return false;		return false;
}		}

CN->Input0 = Node->Input0;		CN->Input0 = Node->Input0;
CN->Input1 = Node->Input1;		CN->Input1 = Node->Input1;
}		}
}		}

CN->ReplacementNode = TL->createComplexDeinterleavingIR(		CN->ReplacementNode = TL->createComplexDeinterleavingIR(
N, CN->Operation, CN->Rotation, CN->Input0, CN->Input1,		N, CN->Operation, CN->Rotation, CN->Input0, CN->Input1,
CN->Accumulator);		CN->Accumulator);
if (!CN->ReplacementNode) {		assert(CN->ReplacementNode \|\| "Target failed to create Intrinsic call.");
LLVM_DEBUG(dbgs() << "Target failed to create Intrinsic call.\n");
return false;
}

cast<Instruction>(CN->ReplacementNode)		cast<Instruction>(CN->ReplacementNode)
->moveAfter(cast<Instruction>(CN->OutputNode));		->moveAfter(cast<Instruction>(CN->OutputNode));

CostOfIntrinsics += TTI.getInstructionCost(		CostOfIntrinsics += TTI.getInstructionCost(
cast<Instruction>(CN->ReplacementNode), CostKind);		cast<Instruction>(CN->ReplacementNode), CostKind);
GeneratedIntrinsics += 1;		GeneratedIntrinsics += 1;
}		}

		Value *R = nullptr;
		if (NeedsSplit) {
		auto *R0 = getFinalInputReplacement(ConvergingI, 0);
		auto *R1 = getFinalInputReplacement(ConvergingI, 1);

		unsigned Size = 8;

		auto Mask = createArrayWithStep(Size, 1);

		IRBuilder<> B(ConvergingI);
		R = B.CreateShuffleVector(R0, R1, Mask);

		} else {
auto *R = getFinalInputReplacement(ConvergingI);		auto *R = getFinalInputReplacement(ConvergingI);
if (!R) {		if (!R) {
LLVM_DEBUG(dbgs() << "Unable to find Final Input Replacement.\n");		LLVM_DEBUG(dbgs() << "Unable to find Final Input Replacement.\n");
return false;		return false;
}		}
		}

InstructionCost CostOfNodes;		InstructionCost CostOfNodes;
for (const auto &I : Instructions)		for (const auto &I : Instructions)
CostOfNodes += TTI.getInstructionCost(I, CostKind);		CostOfNodes += TTI.getInstructionCost(I, CostKind);

LLVM_DEBUG(dbgs() << "Evaluating cost of each graph. Instructions: "		LLVM_DEBUG(dbgs() << "Evaluating cost of each graph. Instructions: "
<< CostOfNodes << ", Intrinsics: " << CostOfIntrinsics		<< CostOfNodes << ", Intrinsics: " << CostOfIntrinsics
<< ".\n");		<< ".\n");
Show All 19 Lines