diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h --- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -38,7 +38,9 @@ // The following 'operations' are used to represent internal states. Backends // are not expected to try and support these in any capacity. Deinterleave, - Symmetric + Symmetric, + ReductionPHI, + ReductionOperation, }; enum class ComplexDeinterleavingRotation { diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -228,8 +228,53 @@ /// Topologically sorted root instructions SmallVector OrderedRoots; + /// When examining a basic block for complex deinterleaving, if it is a simple + /// one-block loop, then the only incoming block is 'Incoming' and the + /// 'BackEdge' block is the block itself." + BasicBlock *BackEdge = nullptr; + BasicBlock *Incoming = nullptr; + + /// ReductionInfo maps from %ReductionOp to %PHInode and Instruction + /// %OutsideUser as it is shown in the IR: + /// + /// vector.body: + /// %PHInode = phi [ zeroinitializer, %entry ], + /// [ %ReductionOp, %vector.body ] + /// ... + /// %ReductionOp = fadd i64 ... + /// ... + /// br i1 %condition, label %vector.body, %middle.block + /// + /// middle.block: + /// %OutsideUser = llvm.vector.reduce.fadd(..., %ReductionOp) + /// + /// %OutsideUser can be `llvm.vector.reduce.fadd` or `fadd` preceding + /// `llvm.vector.reduce.fadd` when unroll factor isn't one. + std::map> ReductionInfo; + + /// In the process of detecting a reduction, we consider a pair of + /// %ReductionOP, which we refer to as real and imag (or vice versa), and + /// traverse the use-tree to detect complex operations. As this is a reduction + /// operation, it will eventually reach RealPHI and ImagPHI, which corresponds + /// to the %ReductionOPs that we suspect to be complex. + /// RealPHI and ImagPHI are used by the identifyPHINode method. + PHINode *RealPHI = nullptr; + PHINode *ImagPHI = nullptr; + + /// OldToNewPHI maps the original real PHINode to a new, double-sized PHINode. + /// The new PHINode corresponds to a vector of deinterleaved complex numbers. + /// This mapping is populated during + /// ComplexDeinterleavingOperation::ReductionPHI node replacement. It is then + /// used in the ComplexDeinterleavingOperation::ReductionOperation node + /// replacement process. + std::map OldToNewPHI; + NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, Instruction *R, Instruction *I) { + assert(((Operation != ComplexDeinterleavingOperation::ReductionPHI && + Operation != ComplexDeinterleavingOperation::ReductionOperation) || + (R && I)) && + "Reduction related nodes must have Real and Imaginary parts"); return std::make_shared(Operation, R, I); } @@ -324,6 +369,8 @@ /// intrinsic (for both fixed and scalable vectors) NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag); + NodePtr identifyPHINode(Instruction *Real, Instruction *Imag); + Value *replaceNode(IRBuilderBase &Builder, RawNodePtr Node); public: @@ -337,6 +384,10 @@ /// current graph. bool identifyNodes(Instruction *RootI); + bool collectPotentialReductions(BasicBlock *B); + + void identifyReductionNodes(); + /// Check that every instruction, from the roots to the leaves, has internal /// uses. bool checkNodes(); @@ -439,6 +490,9 @@ bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { ComplexDeinterleavingGraph Graph(TL, TLI); + if (Graph.collectPotentialReductions(B)) + Graph.identifyReductionNodes(); + for (auto &I : *B) Graph.identifyNodes(&I); @@ -822,6 +876,9 @@ if (NodePtr CN = identifyDeinterleave(Real, Imag)) return CN; + if (NodePtr CN = identifyPHINode(Real, Imag)) + return CN; + auto *VTy = cast(Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); @@ -1293,6 +1350,16 @@ } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { + // This potential root instruction might already have been recognized as + // reduction. Because RootToNode maps both Real and Imaginary parts to + // CompositeNode we should choose only one either Real or Imag instruction to + // use as an anchor for generating complex instruction. + auto It = RootToNode.find(RootI); + if (It != RootToNode.end() && It->second->Real == RootI) { + OrderedRoots.push_back(RootI); + return true; + } + auto RootNode = identifyRoot(RootI); if (!RootNode) return false; @@ -1310,6 +1377,106 @@ return true; } +bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) { + bool FoundPotentialReduction = false; + + auto *Br = dyn_cast(B->getTerminator()); + if (!Br || Br->getNumSuccessors() != 2) + return false; + + // Identify simple one-block loop + if (Br->getSuccessor(0) != B && Br->getSuccessor(1) != B) + return false; + + SmallVector PHIs; + for (auto &PHI : B->phis()) { + if (PHI.getNumIncomingValues() != 2) + continue; + + if (!PHI.hasOneUser()) + continue; + + if (!PHI.getType()->isVectorTy()) + continue; + + auto *ReductionOp = dyn_cast(PHI.getIncomingValueForBlock(B)); + if (!ReductionOp) + continue; + + // Check if final instruction is reduced outside of current block + Instruction *FinalReduction = nullptr; + auto NumUsers = 0u; + for (auto *U : ReductionOp->users()) { + ++NumUsers; + if (U == &PHI) + continue; + FinalReduction = dyn_cast(U); + } + + if (NumUsers != 2 || !FinalReduction || FinalReduction->getParent() == B) + continue; + + ReductionInfo[ReductionOp] = {&PHI, FinalReduction}; + FinalInstructions.insert(&PHI); + + BackEdge = B; + auto BackEdgeIdx = PHI.getBasicBlockIndex(B); + auto IncomingIdx = BackEdgeIdx == 0 ? 1 : 0; + Incoming = PHI.getIncomingBlock(IncomingIdx); + FoundPotentialReduction = true; + } + return FoundPotentialReduction; +} + +void ComplexDeinterleavingGraph::identifyReductionNodes() { + SmallVector Processed(ReductionInfo.size(), false); + SmallVector OperationInstruction; + for (auto &P : ReductionInfo) + OperationInstruction.push_back(P.first); + + // Identify a complex computation by evaluating two reduction operations that + // potentially could be involved + for (size_t i = 0; i < OperationInstruction.size(); ++i) { + if (Processed[i]) + continue; + for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { + if (Processed[j]) + continue; + + auto *Real = OperationInstruction[i]; + auto *Imag = OperationInstruction[j]; + + RealPHI = ReductionInfo[Real].first; + ImagPHI = ReductionInfo[Imag].first; + auto Node = identifyNode(Real, Imag); + if (!Node) { + std::swap(Real, Imag); + std::swap(RealPHI, ImagPHI); + Node = identifyNode(Real, Imag); + } + + // If a node is identified, mark its operation instructions as used to + // prevent re-identification and attach the node to the real part + if (Node) { + LLVM_DEBUG(dbgs() << "Identified reduction starting from instructions: " + << *Real << " / " << *Imag << "\n"); + Processed[i] = true; + Processed[j] = true; + auto RootNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionOperation, Real, Imag); + RootNode->addOperand(Node); + RootToNode[Real] = RootNode; + RootToNode[Imag] = RootNode; + submitCompositeNode(RootNode); + break; + } + } + } + + RealPHI = nullptr; + ImagPHI = nullptr; +} + bool ComplexDeinterleavingGraph::checkNodes() { // Collect all instructions from roots to leaves SmallPtrSet AllInstructions; @@ -1524,6 +1691,17 @@ return submitCompositeNode(PlaceholderNode); } +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real, + Instruction *Imag) { + if (Real != RealPHI || Imag != ImagPHI) + return nullptr; + + NodePtr PlaceholderNode = prepareCompositeNode( + ComplexDeinterleavingOperation::ReductionPHI, Real, Imag); + return submitCompositeNode(PlaceholderNode); +} + static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode, FastMathFlags Flags, Value *InputA, Value *InputB) { @@ -1553,6 +1731,61 @@ if (Node->ReplacementNode) return Node->ReplacementNode; + // If Operation is ReductionPHI, a new empty PHINode is created. + // It is filled later when the ReductionOperation is processed. + if (Node->Operation == ComplexDeinterleavingOperation::ReductionPHI) { + auto *VTy = cast(Node->Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHI()); + OldToNewPHI[dyn_cast(Node->Real)] = NewPHI; + Node->ReplacementNode = NewPHI; + return NewPHI; + } + + // If Operation is ReductionOperation, find the previously generated empty + // PHINode and interleave the initial values. + if (Node->Operation == ComplexDeinterleavingOperation::ReductionOperation) { + auto Result = replaceNode(Builder, Node->Operands[0]); + auto *OldPHIReal = ReductionInfo[Node->Real].first; + auto *OldPHIImag = ReductionInfo[Node->Imag].first; + auto *NewPHI = OldToNewPHI[OldPHIReal]; + + auto *VTy = cast(Node->Real->getType()); + auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); + + // We have to interleave initial origin values coming from IncomingBlock + Value *InitReal = OldPHIReal->getIncomingValueForBlock(Incoming); + Value *InitImag = OldPHIImag->getIncomingValueForBlock(Incoming); + + IRBuilder<> Builder(Incoming->getTerminator()); + auto *NewInit = + Builder.CreateIntrinsic(Intrinsic::experimental_vector_interleave2, + NewVTy, {InitReal, InitImag}); + + NewPHI->addIncoming(NewInit, Incoming); + NewPHI->addIncoming(Result, BackEdge); + + // Deinterleave complex vector outside of loop so that it can be finally + // reduced + auto *FinalReductionReal = ReductionInfo[Node->Real].second; + auto *FinalReductionImag = ReductionInfo[Node->Imag].second; + + Builder.SetInsertPoint(FinalReductionReal); + auto *Deinterleave = + Builder.CreateIntrinsic(Intrinsic::experimental_vector_deinterleave2, + Result->getType(), Result); + + auto *NewReal = Builder.CreateExtractValue(Deinterleave, (uint64_t)0); + FinalReductionReal->replaceUsesOfWith(Node->Real, NewReal); + + Builder.SetInsertPoint(FinalReductionImag); + auto *NewImag = Builder.CreateExtractValue(Deinterleave, 1); + FinalReductionImag->replaceUsesOfWith(Node->Imag, NewImag); + + Node->ReplacementNode = Result; + return Result; + } + Value *Input0 = replaceNode(Builder, Node->Operands[0]); Value *Input1 = Node->Operands.size() > 1 ? replaceNode(Builder, Node->Operands[1]) @@ -1587,9 +1820,18 @@ IRBuilder<> Builder(RootInstruction); auto RootNode = RootToNode[RootInstruction]; Value *R = replaceNode(Builder, RootNode.get()); - assert(R && "Unable to find replacement for RootInstruction"); - DeadInstrRoots.push_back(RootInstruction); - RootInstruction->replaceAllUsesWith(R); + + if (RootNode->Operation == + ComplexDeinterleavingOperation::ReductionOperation) { + ReductionInfo[RootNode->Real].first->removeIncomingValue(BackEdge); + ReductionInfo[RootNode->Imag].first->removeIncomingValue(BackEdge); + DeadInstrRoots.push_back(RootNode->Real); + DeadInstrRoots.push_back(RootNode->Imag); + } else { + assert(R && "Unable to find replacement for RootInstruction"); + DeadInstrRoots.push_back(RootInstruction); + RootInstruction->replaceAllUsesWith(R); + } } for (auto *I : DeadInstrRoots)