diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -41,6 +41,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -3165,7 +3166,7 @@ /// If one cannot be created using all the given inputs, nullptr should be /// returned. virtual Value *createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator = nullptr) const { return nullptr; diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp --- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -184,6 +184,12 @@ class ComplexDeinterleavingGraph { public: + struct Product { + Instruction *Multiplier; + Instruction *Multiplicand; + bool IsPositive; + }; + using Addend = std::pair; using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr; using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr; explicit ComplexDeinterleavingGraph(const TargetLowering *TL, @@ -254,6 +260,45 @@ NodePtr identifyNode(Instruction *I, Instruction *J); + /// Determine if a sum of complex numbers can be formed from \p RealAddends + /// and \p ImagAddends. If \p Accumulator is not null, add the result to it. + /// Return nullptr if it is not possible to construct a complex number. + NodePtr identifyAdditions(std::list &RealAddends, + std::list &ImagAddends, + NodePtr Accumulator); + + /// Extract one addend that has both real and imaginary parts positive.
+ NodePtr extractPositiveAddend(std::list &RealAddends, + std::list &ImagAddends); + + /// Determine if sum of multiplications of complex numbers can be formed from + /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result + /// to it. Return nullptr if it is not possible to construct a complex number. + NodePtr identifyMultiplications(std::list &RealMuls, + std::list &ImagMuls, + NodePtr Accumulator); + + /// This function attempts to locate four products that correspond to a single + /// complex multiplication. Upon success, it returns two NodePtrs representing + /// the complex numbers being multiplied, as well as the rotations for FCMLA + /// multiplication. Additionally, it removes the products from RealMuls and + /// ImagMuls + /// TODO: This code may not be able to detect multiplications that involve an + /// additional 90-degree rotation, however it can be implemented at a later + /// time by extending RotationMap + std::pair + extractOneMul(std::list &RealMuls, std::list &ImagMuls, + ComplexDeinterleavingRotation &Rotation1, + ComplexDeinterleavingRotation &Rotation2); + + /// If the code is compiled with -Ofast or expressions have `reassoc` flag, + /// the order of complex computation operations may be significantly altered, + /// and the real and imaginary parts may not be executed in parallel. This + /// function takes this into consideration and employs a more general approach + /// to identify complex computations. Initially, it gathers all the addends + /// and multiplicands and then constructs a complex expression from them. 
+ NodePtr identifyReassocNodes(Instruction *I, Instruction *J); + NodePtr identifyRoot(Instruction *I); /// Identifies the Deinterleave operation applied to a vector containing @@ -265,7 +310,8 @@ /// intrinsic (for both fixed and scalable vectors) NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag); - Value *replaceNode(RawNodePtr Node); + Value *replaceNode(IRBuilderBase &IRB, + ComplexDeinterleavingGraph::RawNodePtr Node); public: void dump() { dump(dbgs()); } @@ -759,22 +805,377 @@ auto *VTy = cast(Real->getType()); auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy); - if (TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CMulPartial, NewVTy) && - isInstructionPairMul(Real, Imag)) { - return identifyPartialMul(Real, Imag); + bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy); + bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy); + + if (HasCMulSupport && isInstructionPairMul(Real, Imag)) { + Node = identifyPartialMul(Real, Imag); + if (Node) + return Node; + } + + if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) { + Node = identifyAdd(Real, Imag); + if (Node) + return Node; + } + + if (HasCMulSupport && HasCAddSupport) { + Node = identifyReassocNodes(Real, Imag); + if (Node) + return Node; + } + + Node = identifySymmetricOperation(Real, Imag); + if (Node) + return Node; + + LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n"); + return nullptr; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real, + Instruction *Imag) { + if ((Real->getOpcode() != Instruction::FAdd && + Real->getOpcode() != Instruction::FSub && + Real->getOpcode() != Instruction::FNeg) || + (Imag->getOpcode() != Instruction::FAdd && + Imag->getOpcode() != Instruction::FSub && + Imag->getOpcode() != Instruction::FNeg)) + return 
nullptr; + + // Collect muls and non-muls + auto Collect = [](Instruction *Insn, std::list &Muls, + std::list &Addends) -> bool { + SmallVector> Worklist = {{Insn, true}}; + SmallPtrSet Visited; + while (!Worklist.empty()) { + auto [V, IsPositive] = Worklist.back(); + Worklist.pop_back(); + if (!Visited.insert(V).second) + continue; + + Instruction *I = dyn_cast(V); + if (!I) + return false; + + if (I->getOpcode() == Instruction::FAdd) { + Worklist.emplace_back(I->getOperand(1), IsPositive); + Worklist.emplace_back(I->getOperand(0), IsPositive); + } else if (I->getOpcode() == Instruction::FSub) { + Worklist.emplace_back(I->getOperand(1), IsPositive ^ true); + Worklist.emplace_back(I->getOperand(0), IsPositive); + } else if (I->getOpcode() == Instruction::FMul) { + auto *A = dyn_cast(I->getOperand(0)); + if (A && A->getOpcode() == Instruction::FNeg) { + A = dyn_cast(A->getOperand(0)); + IsPositive ^= true; + } + if (!A) + return false; + auto *B = dyn_cast(I->getOperand(1)); + if (B && B->getOpcode() == Instruction::FNeg) { + B = dyn_cast(B->getOperand(0)); + IsPositive ^= true; + } + if (!B) + return false; + Muls.push_back(Product{A, B, IsPositive}); + } else if (I->getOpcode() == Instruction::FNeg) { + Worklist.emplace_back(I->getOperand(0), IsPositive ^ true); + } else { + Addends.emplace_back(I, IsPositive); + continue; + } + + if (!I->getFastMathFlags().allowReassoc()) { + LLVM_DEBUG(dbgs() << "Reassoc is missing from the FastMath flags: " + << *I << "\n"); + return false; + } + } + return true; + }; + + std::list RealMuls, ImagMuls; + std::list RealAddends, ImagAddends; + if (!Collect(Real, RealMuls, RealAddends) || + !Collect(Imag, ImagMuls, ImagAddends)) + return nullptr; + + if (RealAddends.size() != ImagAddends.size()) + return nullptr; + + NodePtr FinalNode; + if (!RealMuls.empty() || !ImagMuls.empty()) { + // If there are multiplicands, first try to extract positive addend and use + // it as an accumulator + FinalNode = 
extractPositiveAddend(RealAddends, ImagAddends); + FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode); + if (!FinalNode) + return nullptr; + } + + if (!RealAddends.empty() || !ImagAddends.empty()) { + FinalNode = identifyAdditions(RealAddends, ImagAddends, FinalNode); + if (!FinalNode) + return nullptr; + } + FinalNode->Real = Real; + FinalNode->Imag = Imag; + submitCompositeNode(FinalNode); + return FinalNode; +} + +std::pair +ComplexDeinterleavingGraph::extractOneMul( + std::list &RealMuls, std::list &ImagMuls, + ComplexDeinterleavingRotation &Rotation1, + ComplexDeinterleavingRotation &Rotation2) { + auto ExtractCommon = [](Product &Real, Product &Imag) -> Instruction * { + if (Real.Multiplicand == Imag.Multiplicand || + Real.Multiplicand == Imag.Multiplier) + return Real.Multiplicand; + + if (Real.Multiplier == Imag.Multiplicand || + Real.Multiplier == Imag.Multiplier) + return Real.Multiplier; + + return nullptr; + }; + + // The following table illustrates the relationship between multiplications + // and rotations. If we consider the multiplication (X + iY) * (U + iV), we + // can see: + // + // Rotation | Real | Imag | + // ---------+--------+--------+ + // 0 | x * u | x * v | + // 90 | -y * v | y * u | + // 180 | -x * u | -x * v | + // 270 | y * v | -y * u | + // + // This code will attempt to identify four multiplications that share common + // operands, and can be represented by two of the expressions in the table + // above. The rotations will be determined, and it will be verified if the + // identified operands correspond to two complex numbers + NodePtr NodeA, NodeB; + Instruction *X, *Y, *U, *V; + auto ItR1 = RealMuls.begin(); + for (auto ItI1 = ImagMuls.begin(); ItI1 != ImagMuls.end(); ++ItI1) { + X = ExtractCommon(*ItR1, *ItI1); + if (!X) + continue; + + U = ItR1->Multiplicand == X ? ItR1->Multiplier : ItR1->Multiplicand; + V = ItI1->Multiplicand == X ? 
ItI1->Multiplier : ItI1->Multiplicand; + + for (auto ItR2 = std::next(ItR1); ItR2 != RealMuls.end(); ++ItR2) { + auto *Tmp = ExtractCommon(*ItR2, *ItI1); + if (!Tmp || Tmp != V) + continue; + + Y = ItR2->Multiplicand == V ? ItR2->Multiplier : ItR2->Multiplicand; + + for (auto ItI2 = ImagMuls.begin(); ItI2 != ImagMuls.end(); ++ItI2) { + if (ItI1 == ItI2) + continue; + + if ((ItI2->Multiplier != U && ItI2->Multiplicand != U) || + (ItI2->Multiplier != Y && ItI2->Multiplicand != Y)) + continue; + + LLVM_DEBUG({ + dbgs() << "Found potential complex multiplication:\n"; + for (auto It : {ItR1, ItR2, ItI1, ItI2}) + dbgs().indent(4) << (It->IsPositive ? "+" : "-") << *It->Multiplier + << " multiplied by " << *It->Multiplicand << "\n"; + }); + + // We detect only pairs of multiplication with 0 and 90 or 180 and 270 + static const ComplexDeinterleavingRotation RotationMap[2][2] = { + {ComplexDeinterleavingRotation::Rotation_180, + ComplexDeinterleavingRotation::Rotation_90}, + {ComplexDeinterleavingRotation::Rotation_270, + ComplexDeinterleavingRotation::Rotation_0}}; + + Rotation1 = RotationMap[ItR1->IsPositive][ItI1->IsPositive]; + Rotation2 = RotationMap[ItR2->IsPositive][ItI2->IsPositive]; + + if (Rotation1 == ComplexDeinterleavingRotation::Rotation_0 || + Rotation1 == ComplexDeinterleavingRotation::Rotation_180) { + + if (Rotation2 != ComplexDeinterleavingRotation::Rotation_90 && + Rotation2 != ComplexDeinterleavingRotation::Rotation_270) + continue; + + } else { + if (Rotation2 != ComplexDeinterleavingRotation::Rotation_0 && + Rotation2 != ComplexDeinterleavingRotation::Rotation_180) + continue; + + std::swap(X, Y); + std::swap(U, V); + } + + NodeA = identifyNode(X, Y); + if (!NodeA) + continue; + NodeB = identifyNode(U, V); + if (!NodeB) + continue; + + LLVM_DEBUG({ + dbgs() << "Identified multiplication (X, Y) * (U, V):\n"; + dbgs().indent(4) << "X: " << *X << "\n"; + dbgs().indent(4) << "Y: " << *Y << "\n"; + dbgs().indent(4) << "U: " << *U << "\n"; + 
dbgs().indent(4) << "V: " << *V << "\n"; + dbgs().indent(4) << "Rotation #1 - " << (int)Rotation1 * 90 + << ", Rotation #2 - " << (int)Rotation2 * 90 << "\n"; + }); + RealMuls.erase(ItR1); + RealMuls.erase(ItR2); + ImagMuls.erase(ItI1); + ImagMuls.erase(ItI2); + return std::make_pair(NodeA, NodeB); + } + } } + return std::make_pair(NodeA, NodeB); +} - if (TL->isComplexDeinterleavingOperationSupported( - ComplexDeinterleavingOperation::CAdd, NewVTy) && - isInstructionPairAdd(Real, Imag)) { - return identifyAdd(Real, Imag); +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyMultiplications( + std::list &RealMuls, std::list &ImagMuls, + NodePtr Accumulator = nullptr) { + if (RealMuls.size() % 2 || ImagMuls.size() % 2) + return nullptr; + + if (RealMuls.size() != ImagMuls.size()) + return nullptr; + + NodePtr Result = Accumulator; + while (!RealMuls.empty()) { + ComplexDeinterleavingRotation Rotation1, Rotation2; + auto [NodeA, NodeB] = + extractOneMul(RealMuls, ImagMuls, Rotation1, Rotation2); + if (!NodeA || !NodeB) + return nullptr; + + NodePtr NodeMul1 = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr); + NodeMul1->Rotation = Rotation1; + NodeMul1->addOperand(NodeA); + NodeMul1->addOperand(NodeB); + if (Result) + NodeMul1->addOperand(Result); + submitCompositeNode(NodeMul1); + + NodePtr NodeMul2 = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr); + NodeMul2->Rotation = Rotation2; + NodeMul2->addOperand(NodeA); + NodeMul2->addOperand(NodeB); + NodeMul2->addOperand(NodeMul1); + submitCompositeNode(NodeMul2); + Result = NodeMul2; } + return Result; +} - auto Symmetric = identifySymmetricOperation(Real, Imag); - LLVM_DEBUG(if (Symmetric == nullptr) dbgs() - << " - Not recognised as a valid pattern.\n"); - return Symmetric; +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyAdditions(std::list &RealAddends, + std::list &ImagAddends, + NodePtr 
Accumulator = nullptr) { + if (RealAddends.size() != ImagAddends.size()) + return nullptr; + + NodePtr Result; + // If we have accumulator use it as first addend + if (Accumulator) { + Result = Accumulator; + // Otherwise find an element with both positive real and imaginary parts. + } else { + Result = extractPositiveAddend(RealAddends, ImagAddends); + } + + if (!Result) + return nullptr; + + while (!RealAddends.empty()) { + auto ItR = RealAddends.begin(); + auto [R, IsPositiveR] = *ItR; + + bool FoundImag = false; + for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { + auto [I, IsPositiveI] = *ItI; + ComplexDeinterleavingRotation Rotation; + if (IsPositiveR && IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_0; + else if (!IsPositiveR && IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (!IsPositiveR && !IsPositiveI) + Rotation = ComplexDeinterleavingRotation::Rotation_180; + else + Rotation = ComplexDeinterleavingRotation::Rotation_270; + + NodePtr AddNode; + if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || + Rotation == ComplexDeinterleavingRotation::Rotation_180) { + AddNode = identifyNode(R, I); + } else { + AddNode = identifyNode(I, R); + } + if (AddNode) { + LLVM_DEBUG({ + dbgs() << "Identified addend:\n"; + dbgs().indent(4) << "X: " << *R << "\n"; + dbgs().indent(4) << "Y: " << *I << "\n"; + dbgs().indent(4) << "Rotation - " << (int)Rotation << "\n"; + }); + + NodePtr TmpNode = prepareCompositeNode( + ComplexDeinterleavingOperation::CAdd, nullptr, nullptr); + TmpNode->Rotation = Rotation; + TmpNode->addOperand(Result); + TmpNode->addOperand(AddNode); + submitCompositeNode(TmpNode); + Result = TmpNode; + RealAddends.erase(ItR); + ImagAddends.erase(ItI); + FoundImag = true; + break; + } + } + if (!FoundImag) + return nullptr; + } + return Result; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::extractPositiveAddend( + std::list &RealAddends, std::list 
&ImagAddends) { + for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) { + for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) { + auto [R, IsPositiveR] = *ItR; + auto [I, IsPositiveI] = *ItI; + if (IsPositiveR && IsPositiveI) { + auto Result = identifyNode(R, I); + if (Result) { + RealAddends.erase(ItR); + ImagAddends.erase(ItI); + return Result; + } + } + } + } + return nullptr; } bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { @@ -1010,7 +1411,8 @@ return submitCompositeNode(PlaceholderNode); } -static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node, +static Value *replaceSymmetricNode(IRBuilderBase &B, + ComplexDeinterleavingGraph::RawNodePtr Node, Value *InputA, Value *InputB) { Instruction *I = Node->Real; if (I->isUnaryOp()) @@ -1020,8 +1422,6 @@ assert(InputB && "Binary symmetric operations need two inputs, only one " "was provided."); - IRBuilder<> B(I); - switch (I->getOpcode()) { case Instruction::FNeg: return B.CreateFNegFMF(InputA, I); @@ -1037,26 +1437,25 @@ } Value *ComplexDeinterleavingGraph::replaceNode( - ComplexDeinterleavingGraph::RawNodePtr Node) { + IRBuilderBase &IRB, ComplexDeinterleavingGraph::RawNodePtr Node) { if (Node->ReplacementNode) return Node->ReplacementNode; - Value *Input0 = replaceNode(Node->Operands[0]); + Value *Input0 = replaceNode(IRB, Node->Operands[0]); Value *Input1 = - Node->Operands.size() > 1 ? replaceNode(Node->Operands[1]) : nullptr; + Node->Operands.size() > 1 ? replaceNode(IRB, Node->Operands[1]) : nullptr; Value *Accumulator = - Node->Operands.size() > 2 ? replaceNode(Node->Operands[2]) : nullptr; + Node->Operands.size() > 2 ? 
replaceNode(IRB, Node->Operands[2]) : nullptr; if (Input1) assert(Input0->getType() == Input1->getType() && "Node inputs need to be of the same type"); if (Node->Operation == ComplexDeinterleavingOperation::Symmetric) - Node->ReplacementNode = replaceSymmetricNode(Node, Input0, Input1); + Node->ReplacementNode = replaceSymmetricNode(IRB, Node, Input0, Input1); else Node->ReplacementNode = TL->createComplexDeinterleavingIR( - Node->Real, Node->Operation, Node->Rotation, Input0, Input1, - Accumulator); + IRB, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); assert(Node->ReplacementNode && "Target failed to create Intrinsic call."); NumComplexTransformations += 1; @@ -1073,7 +1472,7 @@ IRBuilder<> Builder(RootInstruction); auto RootNode = RootToNode[RootInstruction]; - Value *R = replaceNode(RootNode.get()); + Value *R = replaceNode(Builder, RootNode.get()); assert(R && "Unable to find replacement for RootInstruction"); DeadInstrRoots.push_back(RootInstruction); RootInstruction->replaceAllUsesWith(R); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -21,6 +21,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" namespace llvm { @@ -838,7 +839,7 @@ ComplexDeinterleavingOperation Operation, Type *Ty) const override; Value *createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator = nullptr) const override; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -24646,14 +24646,12 @@ } Value *AArch64TargetLowering::createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { VectorType *Ty = cast(InputA->getType()); bool IsScalable = Ty->isScalableTy(); - IRBuilder<> B(I); - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue(); @@ -24677,9 +24675,9 @@ B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride)); } auto *LowerSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt, B.getInt64(0)); @@ -24708,14 +24706,16 @@ } if (OperationType == ComplexDeinterleavingOperation::CAdd) { + if (Rotation == ComplexDeinterleavingRotation::Rotation_0) { + return B.CreateFAdd(InputA, InputB); + } else if (Rotation == ComplexDeinterleavingRotation::Rotation_180) { + return B.CreateFSub(InputA, InputB); + } if (IsScalable) { auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true)); - if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || - Rotation == ComplexDeinterleavingRotation::Rotation_270) - return B.CreateIntrinsic( - Intrinsic::aarch64_sve_fcadd, Ty, - {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)}); - return nullptr; + return B.CreateIntrinsic( + Intrinsic::aarch64_sve_fcadd, Ty, + {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)}); } Intrinsic::ID IntId = Intrinsic::not_intrinsic; diff --git 
a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -747,7 +747,7 @@ ComplexDeinterleavingOperation Operation, Type *Ty) const override; Value *createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator = nullptr) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -22049,14 +22049,12 @@ } Value *ARMTargetLowering::createComplexDeinterleavingIR( - Instruction *I, ComplexDeinterleavingOperation OperationType, + IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { FixedVectorType *Ty = cast(InputA->getType()); - IRBuilder<> B(I); - unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits"); @@ -22081,9 +22079,9 @@ } auto *LowerSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); auto *UpperSplitInt = createComplexDeinterleavingIR( - I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); ArrayRef JoinMask(&SplitSeqVec[0], Ty->getNumElements()); return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); @@ -22103,6 +22101,12 @@ } if (OperationType == ComplexDeinterleavingOperation::CAdd) { + if (Rotation == ComplexDeinterleavingRotation::Rotation_0) { + return B.CreateFAdd(InputA, InputB); + } 
else if (Rotation == ComplexDeinterleavingRotation::Rotation_180) { + return B.CreateFSub(InputA, InputB); + } + // 1 means the value is not halved. auto *ConstHalving = ConstantInt::get(IntTy, 1); diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll @@ -7,18 +7,12 @@ define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v6.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v7.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmla v6.2d, v0.2d, v4.2d -; CHECK-NEXT: fmla v1.2d, v7.2d, v4.2d -; CHECK-NEXT: fmla v6.2d, v7.2d, v2.2d -; CHECK-NEXT: fmls v1.2d, v0.2d, v2.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v6.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v6.2d +; CHECK-NEXT: fcmla v4.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v5.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v4.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v5.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v4.16b +; CHECK-NEXT: mov v1.16b, v5.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -43,25 +37,18 @@ define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v16.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v17.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v2.2d, v4.2d, v5.2d -; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v16.2d, v0.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, 
v7.2d -; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d -; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d -; CHECK-NEXT: fmul v7.2d, v16.2d, v17.2d -; CHECK-NEXT: fmla v4.2d, v17.2d, v1.2d -; CHECK-NEXT: fmla v0.2d, v3.2d, v6.2d -; CHECK-NEXT: fmla v7.2d, v2.2d, v5.2d -; CHECK-NEXT: fmla v4.2d, v3.2d, v5.2d -; CHECK-NEXT: fsub v1.2d, v7.2d, v0.2d -; CHECK-NEXT: fmla v4.2d, v2.2d, v6.2d -; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #90 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -94,26 +81,18 @@ define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d -; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d -; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d -; CHECK-NEXT: fmul v4.2d, v17.2d, v0.2d -; CHECK-NEXT: fmul v5.2d, v17.2d, v18.2d -; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d -; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d -; CHECK-NEXT: fmul v7.2d, v3.2d, v2.2d -; CHECK-NEXT: fmla v4.2d, v18.2d, v1.2d -; CHECK-NEXT: fmla v0.2d, v16.2d, v3.2d -; CHECK-NEXT: fmla v5.2d, v2.2d, v6.2d -; CHECK-NEXT: fmla v7.2d, v16.2d, v6.2d -; CHECK-NEXT: fsub v1.2d, v5.2d, v0.2d -; CHECK-NEXT: fsub v2.2d, v4.2d, v7.2d -; 
CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d -; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #270 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #270 +; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #180 +; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #180 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> @@ -146,25 +125,18 @@ define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d -; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d -; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v16.2d, v17.2d -; CHECK-NEXT: fmul v1.2d, v2.2d, v17.2d -; CHECK-NEXT: zip1 v17.2d, v4.2d, v5.2d -; CHECK-NEXT: zip2 v4.2d, v4.2d, v5.2d -; CHECK-NEXT: fneg v3.2d, v3.2d -; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d -; CHECK-NEXT: fmla v1.2d, v0.2d, v16.2d -; CHECK-NEXT: fmla v3.2d, v0.2d, v2.2d -; CHECK-NEXT: zip2 v0.2d, v6.2d, v7.2d -; CHECK-NEXT: fmls v1.2d, v4.2d, v5.2d -; CHECK-NEXT: fmla v3.2d, v17.2d, v5.2d -; CHECK-NEXT: fmla v1.2d, v17.2d, v0.2d -; CHECK-NEXT: fmla v3.2d, v4.2d, v0.2d -; CHECK-NEXT: zip1 v0.2d, v3.2d, v1.2d -; CHECK-NEXT: zip2 v1.2d, v3.2d, v1.2d +; CHECK-NEXT: movi v16.2d, #0000000000000000 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90 +; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90 +; CHECK-NEXT: fcmla v16.2d, v6.2d, v4.2d, #0 +; 
CHECK-NEXT: fcmla v17.2d, v7.2d, v5.2d, #0 +; CHECK-NEXT: fcmla v16.2d, v6.2d, v4.2d, #270 +; CHECK-NEXT: fcmla v17.2d, v7.2d, v5.2d, #270 +; CHECK-NEXT: mov v0.16b, v16.16b +; CHECK-NEXT: mov v1.16b, v17.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll @@ -6,21 +6,13 @@ define @mull_add( %a, %b, %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z7.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmla z1.d, p0/m, z4.d, z7.d -; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: movprfx z5, z6 -; CHECK-NEXT: fmla z5.d, p0/m, z4.d, z0.d -; CHECK-NEXT: movprfx z3, z5 -; CHECK-NEXT: fmla z3.d, p0/m, z2.d, z7.d -; CHECK-NEXT: fmls z1.d, p0/m, z2.d, z0.d -; CHECK-NEXT: zip1 z0.d, z1.d, z3.d -; CHECK-NEXT: zip2 z1.d, z1.d, z3.d +; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: mov z0.d, z4.d +; CHECK-NEXT: mov z1.d, z5.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -47,26 +39,19 @@ define @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: mov z24.d, #0 // =0x0 
; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d -; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d -; CHECK-NEXT: fmul z1.d, z1.d, z25.d -; CHECK-NEXT: fmul z0.d, z24.d, z0.d -; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d -; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d -; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fmla z2.d, p0/m, z26.d, z3.d -; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z3.d -; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fsub z1.d, z1.d, z0.d -; CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -102,27 +87,19 @@ define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: mov z24.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z1.d, z0.d -; CHECK-NEXT: fmul z1.d, z1.d, z25.d -; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d -; CHECK-NEXT: uzp2 z6.d, z6.d, z7.d -; CHECK-NEXT: fmul z0.d, z24.d, z0.d -; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z3.d -; CHECK-NEXT: fmul z3.d, z5.d, z3.d -; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z4.d -; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d -; CHECK-NEXT: 
fsub z1.d, z1.d, z0.d -; CHECK-NEXT: fsub z2.d, z2.d, z3.d -; CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -158,26 +135,19 @@ define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z1.d, z0.d +; CHECK-NEXT: mov z24.d, #0 // =0x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmul z0.d, z24.d, z0.d -; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d -; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d -; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z25.d -; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z3.d -; CHECK-NEXT: uzp2 z2.d, z6.d, z7.d -; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z4.d -; CHECK-NEXT: fmad z3.d, p0/m, z2.d, z0.d -; CHECK-NEXT: zip1 z0.d, z3.d, z1.d -; CHECK-NEXT: zip2 z1.d, z3.d, z1.d +; CHECK-NEXT: mov z25.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 +; 
CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 +; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll @@ -220,11 +220,11 @@ ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: movi v4.2d, #0000000000000000 -; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #0 -; CHECK-NEXT: fcmla v4.4s, v2.4s, v0.4s, #0 -; CHECK-NEXT: fcmla v3.4s, v1.4s, v0.4s, #90 -; CHECK-NEXT: fcmla v4.4s, v2.4s, v0.4s, #90 -; CHECK-NEXT: fcadd v0.4s, v4.4s, v3.4s, #90 +; CHECK-NEXT: fcmla v3.4s, v2.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #0 +; CHECK-NEXT: fcmla v3.4s, v2.4s, v0.4s, #90 +; CHECK-NEXT: fcmla v4.4s, v1.4s, v0.4s, #90 +; CHECK-NEXT: fcadd v0.4s, v3.4s, v4.4s, #90 ; CHECK-NEXT: ret entry: %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -484,9 +484,9 @@ ; CHECK-LABEL: mul_negequal: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0 -; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90 -; CHECK-NEXT: fneg v0.4s, v2.4s +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #180 +; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #270 +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ 
b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -115,15 +115,7 @@ define <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_add_270_false: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s -; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s -; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s -; CHECK-NEXT: fsub v0.2s, v0.2s, v2.2s -; CHECK-NEXT: zip1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fcadd v0.4s, v0.4s, v1.4s, #270 ; CHECK-NEXT: ret entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32>