diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -0,0 +1,53 @@
+//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass *- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic and deinterleaving.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct ComplexDeinterleavingPass
+    : public PassInfoMixin<ComplexDeinterleavingPass> {
+private:
+  TargetMachine *TM;
+
+public:
+  ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+enum class ComplexDeinterleavingOperation {
+  CAdd,
+  CMulPartial,
+  // The following 'operations' are used to represent internal states. Backends
+  // are not expected to try and support these in any capacity.
+  Shuffle
+};
+
+enum class ComplexDeinterleavingRotation {
+  Rotation_0 = 0,
+  Rotation_90 = 1,
+  Rotation_180 = 2,
+  Rotation_270 = 3,
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -79,6 +79,10 @@
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass();
 
+  /// This pass implements generation of target-specific intrinsics to support
+  /// handling of complex number arithmetic.
+  FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM);
+
   /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
   /// load-linked/store-conditional loops.
   extern char &AtomicExpandID;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -28,6 +28,7 @@
 #include "llvm/ADT/STLArrayExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/LowLevelType.h"
@@ -3063,6 +3064,28 @@
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Does this target support complex deinterleaving?
+  virtual bool isComplexDeinterleavingSupported() const { return false; }
+
+  /// Does this target support complex deinterleaving with the given operation
+  /// and type?
+  virtual bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const {
+    return false;
+  }
+
+  /// Create the IR node for the given complex deinterleaving operation.
+  /// If one cannot be created using all the given inputs, nullptr should be
+  /// returned.
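+  ///
+  /// A typical implementation builds a target intrinsic call in front of \p I
+  /// with an IRBuilder. As a sketch (mirroring what the ARM MVE implementation
+  /// later in this patch does for a CAdd node; Ty, ConstHalving and
+  /// ConstRotation are locals it derives from the inputs and \p Rotation):
+  ///   IRBuilder<> B(I);
+  ///   return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
+  ///                            {ConstHalving, ConstRotation, InputA, InputB});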
+  virtual Value *
+  createComplexDeinterleavingIR(Instruction *I,
+                                ComplexDeinterleavingOperation OperationType,
+                                ComplexDeinterleavingRotation Rotation,
+                                Value *InputA, Value *InputB,
+                                Value *Accumulator = nullptr) const {
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -108,9 +108,10 @@
 void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
 void initializeCheckDebugMachineModulePass(PassRegistry &);
 void initializeCodeGenPreparePass(PassRegistry&);
+void initializeComplexDeinterleavingLegacyPassPass(PassRegistry&);
 void initializeConstantHoistingLegacyPassPass(PassRegistry&);
 void initializeConstantMergeLegacyPassPass(PassRegistry&);
-void initializeConstraintEliminationPass(PassRegistry &);
+void initializeConstraintEliminationPass(PassRegistry&);
 void initializeCorrelatedValuePropagationPass(PassRegistry&);
 void initializeCostModelAnalysisPass(PassRegistry&);
 void initializeCrossDSOCFIPass(PassRegistry&);
@@ -119,7 +120,7 @@
 void initializeDAHPass(PassRegistry&);
 void initializeDCELegacyPassPass(PassRegistry&);
 void initializeDFAJumpThreadingLegacyPassPass(PassRegistry &);
-void initializeDSELegacyPassPass(PassRegistry&);
+void initializeDSELegacyPassPass(PassRegistry &);
 void initializeDataFlowSanitizerLegacyPassPass(PassRegistry &);
 void initializeDeadMachineInstructionElimPass(PassRegistry&);
 void initializeDebugifyMachineModulePass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -43,6 +43,7 @@
   CodeGenPassBuilder.cpp
   CodeGenPrepare.cpp
   CommandFlags.cpp
+  ComplexDeinterleavingPass.cpp
   CriticalAntiDepBreaker.cpp
   DeadMachineInstructionElim.cpp
   DetectDeadLanes.cpp
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -0,0 +1,872 @@
+//===- ComplexDeinterleavingPass.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Identification:
+// This step is responsible for finding the patterns that can be lowered to
+// complex instructions, and building a graph to represent the complex
+// structures. Starting from the "Converging Shuffle" (a shuffle that
+// reinterleaves the complex components, with a mask of <0, 2, 1, 3>), the
+// operands are evaluated and identified as "Composite Nodes" (collections of
+// instructions that can potentially be lowered to a single complex
+// instruction). This is performed by checking the real and imaginary
+// components and tracking the data flow for each component while following
+// the operand pairs. Validity of each node is expected to be done upon
+// creation, and any validation errors should halt traversal and prevent
+// further graph construction.
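+//
+// As an example (illustrative IR, written in the style of the tests added by
+// this patch), a 90-degree complex addition of two <4 x float> vectors reaches
+// the pass in the following shape, where the final shufflevector is the
+// Converging Shuffle that identification starts from:
+//
+//   %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+//   %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+//   %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+//   %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+//   %real = fsub <2 x float> %a.real, %b.imag
+//   %imag = fadd <2 x float> %a.imag, %b.real
+//   %res = shufflevector <2 x float> %real, <2 x float> %imag, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+//
+// Identification turns this into a single CAdd node (rotation 90) whose two
+// operands are the Shuffle nodes standing for %a and %b.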
+//
+// Replacement:
+// This step traverses the graph built up by identification, delegating to the
+// target to validate and generate the correct intrinsics, and plumbs them
+// together connecting each end of the new intrinsics graph to the existing
+// use-def chain. This step is assumed to finish successfully, as all
+// information is expected to be correct by this point.
+//
+//
+// Internal data structure:
+// ComplexDeinterleavingGraph:
+// Keeps references to all the valid CompositeNodes formed as part of the
+// transformation, and every Instruction contained within said nodes. It also
+// holds onto a reference to the root Instruction, and the root node that
+// should replace it.
+//
+// ComplexDeinterleavingCompositeNode:
+// A CompositeNode represents a single transformation point; each node should
+// transform into a single complex instruction (ignoring vector splitting,
+// which would generate more instructions per node). They are identified in a
+// depth-first manner, traversing and identifying the operands of each
+// instruction in the order they appear in the IR.
+// Each node maintains a reference to its Real and Imaginary instructions,
+// as well as any additional instructions that make up the identified operation
+// (internal instructions should only have uses within their containing node).
+// A Node also contains the rotation and operation type that it represents.
+// Operands contains pointers to other CompositeNodes, acting as the edges in
+// the graph. ReplacementNode is the transformed Value* that has been emitted
+// to the IR.
+//
+// Note: If the operation of a Node is Shuffle, only the Real, Imaginary, and
+// ReplacementNode fields of that Node are relevant, where the ReplacementNode
+// should be pre-populated.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "complex-deinterleaving"
+
+STATISTIC(NumComplexTransformations, "Amount of complex patterns transformed");
+
+static cl::opt<bool> ComplexDeinterleavingEnabled(
+    "enable-complex-deinterleaving",
+    cl::desc("Enable generation of complex instructions"), cl::init(true),
+    cl::Hidden);
+
+/// Checks the given mask, and determines whether said mask is interleaving.
+///
+/// To be interleaving, a mask must alternate between `i` and `i + (Length /
+/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a
+/// 4x vector interleaving mask would be <0, 2, 1, 3>).
+static bool isInterleavingMask(ArrayRef<int> Mask);
+
+/// Checks the given mask, and determines whether said mask is deinterleaving.
+///
+/// To be deinterleaving, a mask must increment in steps of 2, and either start
+/// with 0 or 1.
+/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or
+/// <1, 3, 5, 7>).
+static bool isDeinterleavingMask(ArrayRef Mask); + +namespace { + +class ComplexDeinterleavingLegacyPass : public FunctionPass { +public: + static char ID; + + ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeComplexDeinterleavingLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Complex Deinterleaving Pass"; + } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + +private: + const TargetMachine *TM; +}; + +class ComplexDeinterleavingGraph; +struct ComplexDeinterleavingCompositeNode { + + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, + Instruction *R, Instruction *I) + : Operation(Op), Real(R), Imag(I) {} + +private: + friend class ComplexDeinterleavingGraph; + using NodePtr = std::shared_ptr; + using RawNodePtr = ComplexDeinterleavingCompositeNode *; + +public: + ComplexDeinterleavingOperation Operation; + Instruction *Real; + Instruction *Imag; + + // Instructions that should only exist within this node, there should be no + // users of these instructions outside the node. An example of these would be + // the multiply instructions of a partial multiply operation. + SmallVector InternalInstructions; + ComplexDeinterleavingRotation Rotation; + SmallVector Operands; + Value *ReplacementNode = nullptr; + + void addInstruction(Instruction *I) { InternalInstructions.push_back(I); } + void addOperand(NodePtr Node) { Operands.push_back(Node.get()); } + + bool hasAllInternalUses(SmallPtrSet &AllInstructions); + + void dump() { dump(dbgs()); } + void dump(raw_ostream &OS) { + auto PrintValue = [&](Value *V) { + if (V) { + OS << "\""; + V->print(OS, true); + OS << "\"\n"; + } else + OS << "nullptr\n"; + }; + auto PrintNodeRef = [&](RawNodePtr Ptr) { + if (Ptr) + OS << Ptr << "\n"; + else + OS << "nullptr\n"; + }; + + OS << "- CompositeNode: " << this << "\n"; + OS << " Real: "; + PrintValue(Real); + OS << " Imag: "; + PrintValue(Imag); + OS << " ReplacementNode: "; + PrintValue(ReplacementNode); + OS << " Operation: " << (int)Operation << "\n"; + OS << " Rotation: " << ((int) Rotation * 90) << "\n"; + OS << " Operands: \n"; + for (const auto &Op : Operands) { + OS << " - "; + PrintNodeRef(Op); + } + OS << " InternalInstructions:\n"; + for (const auto &I : InternalInstructions) { + OS << " - \""; + I->print(OS, true); + OS << "\"\n"; + } + } +}; + +class ComplexDeinterleavingGraph { +public: + using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr; + using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr; + explicit ComplexDeinterleavingGraph(const TargetLowering *tl) : TL(tl) {} + +private: + const TargetLowering *TL; + Instruction *RootValue; + NodePtr RootNode; + SmallVector CompositeNodes; + SmallPtrSet AllInstructions; + + NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, + Instruction *R, Instruction *I) { + return std::make_shared(Operation, R, + I); + } + + NodePtr submitCompositeNode(NodePtr Node) { + CompositeNodes.push_back(Node); + AllInstructions.insert(Node->Real); + AllInstructions.insert(Node->Imag); + for (auto *I : Node->InternalInstructions) + AllInstructions.insert(I); + return Node; + } + + NodePtr getContainingComposite(Value *R, Value *I) { + for (const auto &CN : CompositeNodes) { + if (CN->Real == R && CN->Imag == I) + return CN; + } + return nullptr; + } + + /// Identifies a complex partial 
multiply pattern and its rotation, based on + /// the following patterns + /// + /// 0: r: cr + ar * br + /// i: ci + ar * bi + /// 90: r: cr - ai * bi + /// i: ci + ai * br + /// 180: r: cr - ar * br + /// i: ci - ar * bi + /// 270: r: cr + ai * bi + /// i: ci - ai * br + NodePtr identifyPartialMul(Instruction *Real, Instruction *Imag); + + /// Identify the other branch of a Partial Mul, taking the CommonOperandI that + /// is partially known from identifyPartialMul, filling in the other half of + /// the complex pair. + NodePtr identifyNodeWithImplicitAdd( + Instruction *I, Instruction *J, + std::pair &CommonOperandI); + + /// Identifies a complex add pattern and its rotation, based on the following + /// patterns. + /// + /// 90: r: ar - bi + /// i: ai + br + /// 270: r: ar + bi + /// i: ai - br + NodePtr identifyAdd(Instruction *Real, Instruction *Imag); + + NodePtr identifyNode(Instruction *I, Instruction *J); + + Value *replaceNode(RawNodePtr Node); + +public: + void dump() { dump(dbgs()); } + void dump(raw_ostream &OS) { + for (const auto &Node : CompositeNodes) + Node->dump(OS); + } + + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + bool identifyNodes(Instruction *RootI); + + /// Perform the actual replacement of the underlying instruction graph. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + void replaceNodes(); +}; + +class ComplexDeinterleaving { +public: + ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli) + : TL(tl), TLI(tli) {} + bool runOnFunction(Function &F); + +private: + bool evaluateBasicBlock(BasicBlock *B); + + const TargetLowering *TL = nullptr; + const TargetLibraryInfo *TLI = nullptr; +}; + +} // namespace + +char ComplexDeinterleavingLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, + "Complex Deinterleaving", false, false) +INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, + "Complex Deinterleaving", false, false) + +PreservedAnalyses ComplexDeinterleavingPass::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + auto &TLI = AM.getResult(F); + if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) { + return new ComplexDeinterleavingLegacyPass(TM); +} + +bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) { + const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + auto TLI = getAnalysis().getTLI(F); + return ComplexDeinterleaving(TL, &TLI).runOnFunction(F); +} + +bool ComplexDeinterleaving::runOnFunction(Function &F) { + if (!ComplexDeinterleavingEnabled) { + LLVM_DEBUG( + dbgs() << "Complex deinterleaving has been explicitly disabled.\n"); + return false; + } + + if (!TL->isComplexDeinterleavingSupported()) { + LLVM_DEBUG( + dbgs() << "Complex deinterleaving has been disabled, target does " + "not support lowering of complex number operations.\n"); + return false; + } + + bool Changed = false; + for (auto &B : F) + Changed |= evaluateBasicBlock(&B); + + return Changed; +} + +static bool isInterleavingMask(ArrayRef Mask) { + // If the size is not even, it's not an interleaving mask + if ((Mask.size() & 1)) + return false; + + int HalfNumElements = Mask.size() / 2; + for (int Idx = 0; Idx < HalfNumElements; 
++Idx) { + int MaskIdx = Idx * 2; + if (Mask[MaskIdx] != Idx || Mask[MaskIdx + 1] != (Idx + HalfNumElements)) + return false; + } + + return true; +} + +static bool isDeinterleavingMask(ArrayRef Mask) { + int Offset = Mask[0]; + int HalfNumElements = Mask.size() / 2; + + for (int Idx = 1; Idx < HalfNumElements; ++Idx) { + if (Mask[Idx] != (Idx * 2) + Offset) + return false; + } + + return true; +} + +bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { + bool Changed = false; + + SmallVector DeadInstrRoots; + + for (auto &I : *B) { + auto *SVI = dyn_cast(&I); + if(!SVI) + continue; + + // Look for a shufflevector that takes separate vectors of the real and + // imaginary components and recombines them into a single vector. + if (!isInterleavingMask(SVI->getShuffleMask())) + continue; + + ComplexDeinterleavingGraph Graph(TL); + if (!Graph.identifyNodes(SVI)) + continue; + + Graph.replaceNodes(); + DeadInstrRoots.push_back(SVI); + Changed = true; + } + + for (const auto &I : DeadInstrRoots) { + if (!I || I->getParent() == nullptr) + continue; + llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + + return Changed; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd( + Instruction *Real, Instruction *Imag, + std::pair &PartialMatch) { + LLVM_DEBUG(dbgs() << "identifyNodeWithImplicitAdd " << *Real << " / " << *Imag + << "\n"); + + if (!Real->hasOneUse() || !Imag->hasOneUse()) { + LLVM_DEBUG(dbgs() << " - Mul operand has multiple uses.\n"); + return nullptr; + } + + if (Real->getOpcode() != Instruction::FMul || + Imag->getOpcode() != Instruction::FMul) { + LLVM_DEBUG(dbgs() << " - Real or imaginary instruction is not fmul\n"); + return nullptr; + } + + Instruction *R0 = dyn_cast(Real->getOperand(0)); + Instruction *R1 = dyn_cast(Real->getOperand(1)); + Instruction *I0 = dyn_cast(Imag->getOperand(0)); + Instruction *I1 = dyn_cast(Imag->getOperand(1)); + if (!R0 || !R1 || !I0 || !I1) { + LLVM_DEBUG(dbgs() << " - Mul operand not Instruction\n"); + return nullptr; + } + + // A +/+ has a rotation of 0. If any of the operands are fneg, we flip the + // rotations and use the operand. + unsigned Negs = 0; + SmallVector FNegs; + if (R0->getOpcode() == Instruction::FNeg || + R1->getOpcode() == Instruction::FNeg) { + Negs |= 1; + if (R0->getOpcode() == Instruction::FNeg) { + FNegs.push_back(R0); + R0 = dyn_cast(R0->getOperand(0)); + } else { + FNegs.push_back(R1); + R1 = dyn_cast(R1->getOperand(0)); + } + if (!R0 || !R1) + return nullptr; + } + if (I0->getOpcode() == Instruction::FNeg || + I1->getOpcode() == Instruction::FNeg) { + Negs |= 2; + Negs ^= 1; + if (I0->getOpcode() == Instruction::FNeg) { + FNegs.push_back(I0); + I0 = dyn_cast(I0->getOperand(0)); + } else { + FNegs.push_back(I1); + I1 = dyn_cast(I1->getOperand(0)); + } + if (!I0 || !I1) + return nullptr; + } + + ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation) Negs; + + Instruction *CommonOperand; + Instruction *UncommonRealOp; + Instruction *UncommonImagOp; + + if (R0 == I0 || R0 == I1) { + CommonOperand = R0; + UncommonRealOp = R1; + } else if (R1 == I0 || R1 == I1) { + CommonOperand = R1; + UncommonRealOp = R0; + } else { + LLVM_DEBUG(dbgs() << " - No equal operand\n"); + return nullptr; + } + + UncommonImagOp = (CommonOperand == I0) ? 
I1 : I0; + if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || + Rotation == ComplexDeinterleavingRotation::Rotation_270) + std::swap(UncommonRealOp, UncommonImagOp); + + // Between identifyPartialMul and here we need to have found a complete valid + // pair from the CommonOperand of each part. + if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || + Rotation == ComplexDeinterleavingRotation::Rotation_180) + PartialMatch.first = CommonOperand; + else + PartialMatch.second = CommonOperand; + + if (!PartialMatch.first || !PartialMatch.second) { + LLVM_DEBUG(dbgs() << " - Incomplete partial match\n"); + return nullptr; + } + + NodePtr CommonNode = identifyNode(PartialMatch.first, + PartialMatch.second); + if (!CommonNode) { + LLVM_DEBUG(dbgs() << " - No CommonNode identified\n"); + return nullptr; + } + + NodePtr UncommonNode = identifyNode(UncommonRealOp, + UncommonImagOp); + if (!UncommonNode) { + LLVM_DEBUG(dbgs() << " - No UncommonNode identified\n"); + return nullptr; + } + + NodePtr Node = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, Real, Imag); + Node->Rotation = Rotation; + Node->addOperand(CommonNode); + Node->addOperand(UncommonNode); + Node->InternalInstructions.append(FNegs); + return submitCompositeNode(Node); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, + Instruction *Imag) { + LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag + << "\n"); + // Determine rotation + ComplexDeinterleavingRotation Rotation; + if (Real->getOpcode() == Instruction::FAdd && + Imag->getOpcode() == Instruction::FAdd) + Rotation = ComplexDeinterleavingRotation::Rotation_0; + else if (Real->getOpcode() == Instruction::FSub && + Imag->getOpcode() == Instruction::FAdd) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (Real->getOpcode() == Instruction::FSub && + Imag->getOpcode() == Instruction::FSub) + Rotation = ComplexDeinterleavingRotation::Rotation_180; + else if (Real->getOpcode() == Instruction::FAdd && + Imag->getOpcode() == Instruction::FSub) + Rotation = ComplexDeinterleavingRotation::Rotation_270; + else { + LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n"); + return nullptr; + } + + if (!Real->getFastMathFlags().allowContract() || + !Imag->getFastMathFlags().allowContract()) { + LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n"); + return nullptr; + } + + Value *CR = Real->getOperand(0); + Instruction *RealMulI = dyn_cast(Real->getOperand(1)); + if (!RealMulI) + return nullptr; + Value *CI = Imag->getOperand(0); + Instruction *ImagMulI = dyn_cast(Imag->getOperand(1)); + if (!ImagMulI) + return nullptr; + + if (!RealMulI->hasOneUse() || !ImagMulI->hasOneUse()) { + LLVM_DEBUG(dbgs() << " - Mul instruction has multiple uses\n"); + return nullptr; + } + + Instruction *R0 = dyn_cast(RealMulI->getOperand(0)); + Instruction *R1 = dyn_cast(RealMulI->getOperand(1)); + Instruction *I0 = dyn_cast(ImagMulI->getOperand(0)); + Instruction *I1 = dyn_cast(ImagMulI->getOperand(1)); + if (!R0 || !R1 || !I0 || !I1) { + LLVM_DEBUG(dbgs() << " - Mul operand not Instruction\n"); + return nullptr; + } + + Instruction *CommonOperand; + Instruction *UncommonRealOp; + Instruction *UncommonImagOp; + + if (R0 == I0 || R0 == I1) { + CommonOperand = R0; + UncommonRealOp = R1; + } else if (R1 == I0 || R1 == I1) { + CommonOperand = R1; + UncommonRealOp = R0; + } else { + LLVM_DEBUG(dbgs() << " - No equal operand\n"); + return nullptr; + } + + UncommonImagOp = 
(CommonOperand == I0) ? I1 : I0; + if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || Rotation == ComplexDeinterleavingRotation::Rotation_270) + std::swap(UncommonRealOp, UncommonImagOp); + + std::pair PartialMatch( + (Rotation == ComplexDeinterleavingRotation::Rotation_0 || Rotation == ComplexDeinterleavingRotation::Rotation_180) ? CommonOperand : nullptr, + (Rotation == ComplexDeinterleavingRotation::Rotation_90 || Rotation == ComplexDeinterleavingRotation::Rotation_270) ? CommonOperand : nullptr); + NodePtr CNode = identifyNodeWithImplicitAdd( + cast(CR), cast(CI), PartialMatch); + if (!CNode) { + LLVM_DEBUG(dbgs() << " - No cnode identified\n"); + return nullptr; + } + + NodePtr UncommonRes = identifyNode(UncommonRealOp, UncommonImagOp); + if (!UncommonRes) { + LLVM_DEBUG(dbgs() << " - No UncommonRes identified\n"); + return nullptr; + } + + assert(PartialMatch.first && PartialMatch.second); + NodePtr CommonRes = identifyNode(PartialMatch.first, PartialMatch.second); + if (!CommonRes) { + LLVM_DEBUG(dbgs() << " - No CommonRes identified\n"); + return nullptr; + } + + NodePtr Node = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, Real, Imag); + Node->addInstruction(RealMulI); + Node->addInstruction(ImagMulI); + Node->Rotation = Rotation; + Node->addOperand(CommonRes); + Node->addOperand(UncommonRes); + Node->addOperand(CNode); + return submitCompositeNode(Node); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyAdd(Instruction *Real, Instruction *Imag) { + LLVM_DEBUG(dbgs() << "identifyAdd " << *Real << " / " << *Imag << "\n"); + + // Determine rotation + ComplexDeinterleavingRotation Rotation; + if (Real->getOpcode() == Instruction::FSub && + Imag->getOpcode() == Instruction::FAdd) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (Real->getOpcode() == Instruction::FAdd && + Imag->getOpcode() == Instruction::FSub) + Rotation = ComplexDeinterleavingRotation::Rotation_270; + else { + LLVM_DEBUG(dbgs() << " - Unhandled case, rotation is not assigned.\n"); + return nullptr; + } + + auto *AR = cast(Real->getOperand(0)); + auto *BI = cast(Real->getOperand(1)); + auto *AI = cast(Imag->getOperand(0)); + auto *BR = cast(Imag->getOperand(1)); + + NodePtr ResA = identifyNode(AR, AI); + if (!ResA) { + LLVM_DEBUG(dbgs() << " - AR/AI is not identified as a composite node.\n"); + return nullptr; + } + NodePtr ResB = identifyNode(BR, BI); + if (!ResB) { + LLVM_DEBUG(dbgs() << " - BR/BI is not identified as a composite node.\n"); + return nullptr; + } + + NodePtr Node = + prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, Real, Imag); + Node->Rotation = Rotation; + Node->addOperand(ResA); + Node->addOperand(ResB); + return submitCompositeNode(Node); +} + +static bool isInstructionPairAdd(Instruction *A, Instruction *B) { + unsigned OpcA = A->getOpcode(); + unsigned OpcB = B->getOpcode(); + return (OpcA == Instruction::FSub && OpcB == Instruction::FAdd) || + (OpcA == Instruction::FAdd && OpcB == Instruction::FSub); +} + +static bool isInstructionPairMul(Instruction *A, Instruction *B) { + auto Pattern = + m_BinOp(m_FMul(m_Value(), m_Value()), m_FMul(m_Value(), m_Value())); + + return match(A, Pattern) && match(B, Pattern); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) { + LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n"); + if (NodePtr CN = getContainingComposite(Real, Imag)) { + LLVM_DEBUG(dbgs() << " - Folding to 
existing node\n"); + return CN; + } + + auto *RealShuffle = dyn_cast(Real); + auto *ImagShuffle = dyn_cast(Imag); + if (RealShuffle && ImagShuffle) { + Value *RealOp1 = RealShuffle->getOperand(1); + if (!isa(RealOp1) && !isa(RealOp1)) { + LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); + return nullptr; + } + Value *ImagOp1 = ImagShuffle->getOperand(1); + if (!isa(ImagOp1) && !isa(ImagOp1)) { + LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); + return nullptr; + } + + Value *RealOp0 = RealShuffle->getOperand(0); + Value *ImagOp0 = ImagShuffle->getOperand(0); + + if (RealOp0 != ImagOp0) { + LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); + return nullptr; + } + + ArrayRef RealMask = RealShuffle->getShuffleMask(); + ArrayRef ImagMask = ImagShuffle->getShuffleMask(); + if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { + LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n"); + return nullptr; + } + + if (RealMask[0] != 0 || ImagMask[0] != 1) { + LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); + return nullptr; + } + + // Type checking, the shuffle type should be a vector type of the same + // scalar type, but half the size + auto CheckType = [&](ShuffleVectorInst *Shuffle) { + Value *Op = Shuffle->getOperand(0); + auto *ShuffleTy = cast(Shuffle->getType()); + auto *OpTy = cast(Op->getType()); + + if (OpTy->getScalarType() != ShuffleTy->getScalarType()) + return false; + if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) + return false; + + return true; + }; + + auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { + if (!CheckType(Shuffle)) + return false; + + ArrayRef Mask = Shuffle->getShuffleMask(); + int Last = *Mask.rbegin(); + + Value *Op = Shuffle->getOperand(0); + auto *OpTy = cast(Op->getType()); + int NumElements = OpTy->getNumElements(); + + // Ensure that the deinterleaving shuffle only pulls from the first + // shuffle operand. 
+ return Last < NumElements; + }; + + if (RealShuffle->getType() != ImagShuffle->getType()) { + LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n"); + return nullptr; + } + if (!CheckDeinterleavingShuffle(RealShuffle)) { + LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n"); + return nullptr; + } + if (!CheckDeinterleavingShuffle(ImagShuffle)) { + LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n"); + return nullptr; + } + + NodePtr PlaceholderNode = + prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Shuffle, + RealShuffle, ImagShuffle); + PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); + return submitCompositeNode(PlaceholderNode); + } + if (RealShuffle || ImagShuffle) + return nullptr; + + auto *VTy = cast(Real->getType()); + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy) && + isInstructionPairMul(Real, Imag)) { + return identifyPartialMul(Real, Imag); + } + + if (TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy) && + isInstructionPairAdd(Real, Imag)) { + return identifyAdd(Real, Imag); + } + + return nullptr; +} + +bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { + Instruction *Real; + Instruction *Imag; + if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) + return false; + + RootValue = RootI; + AllInstructions.insert(RootI); + RootNode = identifyNode(Real, Imag); + + LLVM_DEBUG({ + Function *F = RootI->getFunction(); + BasicBlock *B = RootI->getParent(); + dbgs() << "Complex deinterleaving graph for " << F->getName() + << "::" << B->getName() << ".\n"; + dump(dbgs()); + dbgs() << "\n"; + }); + + // Check all instructions have internal uses + for (const auto &Node : CompositeNodes) { + if (!Node->hasAllInternalUses(AllInstructions)) { + LLVM_DEBUG(dbgs() << " - Invalid internal uses\n"); + return false; + } + } + return RootNode != nullptr; +} + +Value *ComplexDeinterleavingGraph::replaceNode( + ComplexDeinterleavingGraph::RawNodePtr Node) { + if (Node->ReplacementNode) + return Node->ReplacementNode; + + Value *Input0 = replaceNode(Node->Operands[0]); + Value *Input1 = replaceNode(Node->Operands[1]); + Value *Accumulator = + Node->Operands.size() > 2 ? 
replaceNode(Node->Operands[2]) : nullptr; + + assert(Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type"); + + Node->ReplacementNode = TL->createComplexDeinterleavingIR( + Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); + + assert(Node->ReplacementNode && "Target failed to create Intrinsic call."); + NumComplexTransformations += 1; + return Node->ReplacementNode; +} + +void ComplexDeinterleavingGraph::replaceNodes() { + Value *R = replaceNode(RootNode.get()); + assert(R && "Unable to find replacement for RootValue"); + RootValue->replaceAllUsesWith(R); +} + +bool ComplexDeinterleavingCompositeNode::hasAllInternalUses( + SmallPtrSet &AllInstructions) { + if (Operation == ComplexDeinterleavingOperation::Shuffle) + return true; + + for (auto *User : Real->users()) { + if (!AllInstructions.contains(cast(User))) + return false; + } + for (auto *User : Imag->users()) { + if (!AllInstructions.contains(cast(User))) + return false; + } + for (auto *I : InternalInstructions) { + for (auto *User : I->users()) { + if (!AllInstructions.contains(cast(User))) + return false; + } + } + return true; +} \ No newline at end of file diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -734,6 +734,15 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21768,3 +21768,96 @@ MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } + +bool ARMTargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasMVEFloatOps(); +} + +bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast(Ty); + if (!VTy) + return false; + + auto *ScalarTy = VTy->getScalarType(); + unsigned NumElements = VTy->getNumElements(); + + unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; + if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth)) + return false; + + // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 + return ScalarTy->isHalfTy() || ScalarTy->isFloatTy(); +} + +Value *ARMTargetLowering::createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { + + FixedVectorType *Ty = cast(InputA->getType()); + + IRBuilder<> B(I); + + unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); + + assert(TyWidth >= 128); + + if (TyWidth > 128) { + int Stride = Ty->getNumElements() / 2; + auto SplitSeq = llvm::seq(0, Ty->getNumElements()); + auto SplitSeqVec = llvm::to_vector(SplitSeq); + ArrayRef LowerSplitMask(&SplitSeqVec[0], Stride); + ArrayRef 
UpperSplitMask(&SplitSeqVec[Stride], Stride); + + auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); + auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); + auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); + auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); + Value *LowerSplitAcc = nullptr; + Value *UpperSplitAcc = nullptr; + + if (Accumulator) { + LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask); + UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); + } + + auto *LowerSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + auto *UpperSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + + ArrayRef JoinMask(&SplitSeqVec[0], Ty->getNumElements()); + return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); + } + + auto *IntTy = Type::getInt32Ty(B.getContext()); + + ConstantInt *ConstRotation = nullptr; + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + ConstRotation = ConstantInt::get(IntTy, (int) Rotation); + + if (Accumulator) + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstRotation, Accumulator, InputB, InputA}); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstRotation, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + // 1 means the value is not halved. + auto *ConstHalving = ConstantInt::get(IntTy, 1); + + if (Rotation == ComplexDeinterleavingRotation::Rotation_90) + ConstRotation = ConstantInt::get(IntTy, 0); + else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) + ConstRotation = ConstantInt::get(IntTy, 1); + + if (!ConstRotation) + return nullptr; // Invalid rotation for arm_mve_vcaddq + + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {ConstHalving, ConstRotation, InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -426,9 +426,13 @@ TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. - if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createARMParallelDSPPass()); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. 
if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Deinterleaving Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll @@ -59,39 +59,7 @@ define arm_aapcs_vfpcc <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_add_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vins.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s13, s6 -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vins.f16 s13, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vins.f16 s8, s1 -; CHECK-NEXT: vins.f16 s0, s10 -; CHECK-NEXT: vmovx.f16 s10, s3 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s9, s3 -; CHECK-NEXT: vins.f16 s1, s10 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vadd.f16 q2, q3, q2 -; CHECK-NEXT: vsub.f16 q1, q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s6, s4 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov.f32 s1, s6 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> @@ -108,72 +76,8 @@ define arm_aapcs_vfpcc <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_add_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vins.f16 s16, s1 -; CHECK-NEXT: vmovx.f16 s1, s1 -; CHECK-NEXT: vins.f16 s17, s3 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vmovx.f16 s18, s9 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vmovx.f16 s3, s3 -; CHECK-NEXT: vins.f16 s20, s18 -; CHECK-NEXT: vmovx.f16 s21, s10 -; CHECK-NEXT: vmovx.f16 s18, s11 -; CHECK-NEXT: vmovx.f16 s22, s12 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s3, s5 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmov.f32 s19, s6 -; CHECK-NEXT: vins.f16 s21, s18 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s23, s14 -; CHECK-NEXT: vmovx.f16 s24, s15 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmovx.f16 s3, s6 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vins.f16 s12, s13 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vmov.f32 s9, s10 -; 
CHECK-NEXT: vins.f16 s18, s5 -; CHECK-NEXT: vins.f16 s19, s7 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vadd.f16 q4, q5, q4 -; CHECK-NEXT: vsub.f16 q2, q2, q0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s5, s10 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vmovx.f16 s7, s11 -; CHECK-NEXT: vins.f16 s11, s19 -; CHECK-NEXT: vins.f16 s12, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vins.f16 s10, s18 -; CHECK-NEXT: vins.f16 s9, s17 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov.f32 s3, s12 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vcadd.f16 q0, q2, q0, #90 +; CHECK-NEXT: vcadd.f16 q1, q3, q1, #90 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> @@ -190,144 +94,21 @@ define arm_aapcs_vfpcc <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_add_v32f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vcadd.f16 q0, q4, q0, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vcadd.f16 q1, q4, q1, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: add r0, sp, #64 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vmovx.f16 s18, s1 -; CHECK-NEXT: add r0, sp, #80 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s17, s2 -; CHECK-NEXT: vmovx.f16 s18, s3 -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vmov.f32 s20, s24 -; CHECK-NEXT: vins.f16 s17, s18 -; CHECK-NEXT: vmovx.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s22, s5 -; CHECK-NEXT: vmovx.f16 s19, s6 -; CHECK-NEXT: vmovx.f16 s23, s7 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s1, s25 -; CHECK-NEXT: vmovx.f16 s24, s24 -; CHECK-NEXT: vmov.f32 s21, s26 -; CHECK-NEXT: vins.f16 s20, s25 -; CHECK-NEXT: vins.f16 s18, s22 -; CHECK-NEXT: vmov.f32 s22, s28 -; CHECK-NEXT: vins.f16 s19, s23 -; CHECK-NEXT: vmov.f32 s23, s30 -; CHECK-NEXT: vins.f16 s24, s1 -; CHECK-NEXT: vmovx.f16 s25, s26 -; CHECK-NEXT: vmovx.f16 s1, s27 -; CHECK-NEXT: vins.f16 s21, s27 -; CHECK-NEXT: vins.f16 s25, s1 -; CHECK-NEXT: vmovx.f16 s26, s28 -; CHECK-NEXT: vmovx.f16 s1, s29 -; CHECK-NEXT: vins.f16 s22, s29 -; CHECK-NEXT: vins.f16 s23, s31 -; CHECK-NEXT: add r0, sp, #112 -; CHECK-NEXT: vins.f16 s26, s1 -; CHECK-NEXT: vmovx.f16 s1, s31 -; CHECK-NEXT: vmovx.f16 s27, s30 -; CHECK-NEXT: vsub.f16 q4, q5, q4 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vins.f16 s27, s1 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmovx.f16 s28, s8 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmovx.f16 s6, s23 -; CHECK-NEXT: vadd.f16 q0, q6, q0 -; CHECK-NEXT: vmovx.f16 s27, s22 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vins.f16 s16, s0 -; CHECK-NEXT: 
vmovx.f16 s0, s0 -; CHECK-NEXT: vins.f16 s27, s6 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s5, s18 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vins.f16 s18, s2 -; CHECK-NEXT: vmovx.f16 s26, s20 -; CHECK-NEXT: vmovx.f16 s2, s21 -; CHECK-NEXT: vins.f16 s28, s6 -; CHECK-NEXT: vmovx.f16 s29, s10 -; CHECK-NEXT: vmovx.f16 s6, s11 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vins.f16 s26, s2 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vins.f16 s29, s6 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s30, s12 -; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vins.f16 s30, s6 -; CHECK-NEXT: vins.f16 s0, s13 -; CHECK-NEXT: vins.f16 s2, s15 -; CHECK-NEXT: vmovx.f16 s6, s15 -; CHECK-NEXT: vmovx.f16 s31, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vins.f16 s31, s6 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s25, s14 -; CHECK-NEXT: vins.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s6, s15 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vins.f16 s22, s23 -; CHECK-NEXT: vins.f16 s20, s21 -; CHECK-NEXT: vins.f16 s12, s13 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vins.f16 s25, s6 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmovx.f16 s7, s19 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vins.f16 s19, s3 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vadd.f16 q2, q6, q2 -; CHECK-NEXT: vmov.f32 s15, s22 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vsub.f16 q5, q3, q7 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s13, s22 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s13, s0 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmovx.f16 s26, s21 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vins.f16 s22, s10 -; CHECK-NEXT: vmovx.f16 s15, s23 -; CHECK-NEXT: vins.f16 s23, s11 -; CHECK-NEXT: vins.f16 s26, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s20, s8 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s17, s1 -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vins.f16 s15, s0 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vmov.f32 s12, s22 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmov.f32 s10, s21 -; CHECK-NEXT: vmov.f32 s14, s23 -; CHECK-NEXT: vmov.f32 s3, s24 -; CHECK-NEXT: vmov.f32 s11, s26 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vcadd.f16 q2, q4, q2, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vcadd.f16 q3, q4, q3, #90 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll @@ -76,44 +76,9 @@ define arm_aapcs_vfpcc <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, 
s1 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s9, s2 -; CHECK-NEXT: vmovx.f16 s10, s3 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 s9, s10 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmovx.f16 s13, s6 -; CHECK-NEXT: vins.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s13, s10 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vmul.f16 q4, q3, q0 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vfma.f16 q4, q1, q2 -; CHECK-NEXT: vmul.f16 q2, q2, q3 -; CHECK-NEXT: vneg.f16 q2, q2 -; CHECK-NEXT: vfma.f16 q2, q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vins.f16 s9, s17 -; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #90 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> @@ -134,75 +99,15 @@ define arm_aapcs_vfpcc <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmovx.f16 s19, s6 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vmovx.f16 s18, s1 -; CHECK-NEXT: vins.f16 s19, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmovx.f16 s25, s10 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s17, s2 -; CHECK-NEXT: vmovx.f16 s18, s3 -; CHECK-NEXT: vins.f16 s25, s8 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmovx.f16 s26, s12 -; CHECK-NEXT: vins.f16 s17, s18 -; CHECK-NEXT: vmovx.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s22, s5 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s1, s9 -; CHECK-NEXT: vins.f16 s26, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmovx.f16 s27, s14 -; CHECK-NEXT: vins.f16 s18, s22 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s24, s1 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s27, s8 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s22, s12 -; CHECK-NEXT: vins.f16 s20, s9 -; CHECK-NEXT: vmov.f32 s23, s14 -; CHECK-NEXT: vins.f16 s21, s11 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s22, s13 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmul.f16 q1, q4, q6 -; CHECK-NEXT: vmul.f16 q2, q6, q0 -; CHECK-NEXT: vneg.f16 q3, q1 -; CHECK-NEXT: vfma.f16 q3, q5, q0 -; CHECK-NEXT: vfma.f16 q2, q5, q4 -; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s5, s14 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s12, s8 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s7, s15 -; CHECK-NEXT: vins.f16 s15, s11 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s14, s10 -; CHECK-NEXT: vins.f16 s13, s9 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov q0, q3 -; CHECK-NEXT: vmov.f32 s6, s15 
-; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov.f32 s3, s8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f16 q4, q0, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q0, q2, #90 +; CHECK-NEXT: vcmul.f16 q2, q1, q3, #0 +; CHECK-NEXT: vcmla.f16 q2, q1, q3, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> @@ -223,157 +128,29 @@ define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vcmul.f16 q0, q0, q5, #0 +; CHECK-NEXT: vcmla.f16 q0, q4, q5, #90 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vcmul.f16 q4, q1, q5, #0 +; CHECK-NEXT: vcmla.f16 q4, q1, q5, #90 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: add r0, sp, #112 -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmovx.f16 s16, s24 -; CHECK-NEXT: vmovx.f16 s18, s25 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s17, s26 -; CHECK-NEXT: vmovx.f16 s18, s27 -; CHECK-NEXT: vmovx.f16 s19, s29 -; CHECK-NEXT: vins.f16 s17, s18 -; CHECK-NEXT: vmovx.f16 s18, s28 -; CHECK-NEXT: vins.f16 s18, s19 -; CHECK-NEXT: vmovx.f16 s19, s30 -; CHECK-NEXT: vmovx.f16 s8, s31 -; CHECK-NEXT: vmov.f32 s20, s0 -; CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s22, s4 -; CHECK-NEXT: vins.f16 s20, s1 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vins.f16 s21, s3 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s22, s5 -; CHECK-NEXT: vins.f16 s23, s7 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmovx.f16 s3, s6 -; CHECK-NEXT: vins.f16 s26, s27 -; CHECK-NEXT: vins.f16 s30, s31 -; CHECK-NEXT: vins.f16 s28, s29 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vins.f16 s24, s25 -; CHECK-NEXT: vmov.f32 s25, s26 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmul.f16 q2, q4, q5 -; CHECK-NEXT: vmov.f32 s26, s28 -; CHECK-NEXT: add r0, sp, #128 -; CHECK-NEXT: vmov.f32 s27, s30 -; CHECK-NEXT: vfma.f16 q2, q6, q0 -; CHECK-NEXT: vmul.f16 q0, q0, q4 -; CHECK-NEXT: vneg.f16 q4, q0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vfma.f16 q4, q6, q5 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vmovx.f16 s5, s18 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s13 -; CHECK-NEXT: vins.f16 s16, s8 -; CHECK-NEXT: vins.f16 s18, s10 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s20, s0 
-; CHECK-NEXT: vmovx.f16 s21, s14 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vmovx.f16 s22, s8 -; CHECK-NEXT: vins.f16 s21, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: add r0, sp, #144 -; CHECK-NEXT: vins.f16 s22, s6 -; CHECK-NEXT: vmovx.f16 s23, s10 -; CHECK-NEXT: vmovx.f16 s6, s11 -; CHECK-NEXT: vmov.f32 s24, s0 -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vins.f16 s23, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vins.f16 s24, s1 -; CHECK-NEXT: vins.f16 s0, s6 -; CHECK-NEXT: vmovx.f16 s6, s3 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vmov.f32 s25, s2 -; CHECK-NEXT: vins.f16 s1, s6 -; CHECK-NEXT: vmovx.f16 s6, s29 -; CHECK-NEXT: vmovx.f16 s2, s28 -; CHECK-NEXT: vins.f16 s25, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vmovx.f16 s6, s31 -; CHECK-NEXT: vmovx.f16 s3, s30 -; CHECK-NEXT: vmov.f32 s26, s28 -; CHECK-NEXT: vmov.f32 s27, s30 -; CHECK-NEXT: vins.f16 s12, s13 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vins.f16 s3, s6 -; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vins.f16 s26, s29 -; CHECK-NEXT: vmov.f32 s15, s10 -; CHECK-NEXT: vins.f16 s27, s31 -; CHECK-NEXT: vmul.f16 q7, q0, q3 -; CHECK-NEXT: vmul.f16 q0, q5, q0 -; CHECK-NEXT: vfma.f16 q7, q6, q5 -; CHECK-NEXT: vneg.f16 q5, q0 -; CHECK-NEXT: vfma.f16 q5, q6, q3 -; CHECK-NEXT: vmovx.f16 s0, s28 -; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s13, s22 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s0, s30 -; CHECK-NEXT: vins.f16 s13, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmovx.f16 s7, s19 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s26, s21 -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s29 -; CHECK-NEXT: vins.f16 s22, s30 -; CHECK-NEXT: vmovx.f16 s15, s23 -; CHECK-NEXT: vins.f16 s23, s31 -; CHECK-NEXT: vins.f16 s26, s0 -; CHECK-NEXT: vmovx.f16 s0, s31 -; CHECK-NEXT: vins.f16 s20, s28 -; CHECK-NEXT: vins.f16 s21, s29 -; CHECK-NEXT: vins.f16 s17, s1 -; CHECK-NEXT: vins.f16 s19, s3 +; CHECK-NEXT: vcmul.f16 q5, q2, q1, #0 +; CHECK-NEXT: vcmla.f16 q5, q2, q1, #90 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vins.f16 s15, s0 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vmov.f32 s12, s22 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmov.f32 s10, s21 -; CHECK-NEXT: vmov.f32 s14, s23 -; CHECK-NEXT: vmov.f32 s3, s24 -; CHECK-NEXT: vmov.f32 s11, s26 -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vcmul.f16 q6, q3, q1, #0 +; CHECK-NEXT: vcmla.f16 q6, q3, q1, #90 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q3, q6 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll @@ -27,19 +27,8 @@ define arm_aapcs_vfpcc <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: complex_add_v4f32: ; 
CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vadd.f32 q2, q3, q2 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vsub.f32 q1, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s0, s4 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> @@ -56,34 +45,13 @@ define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_add_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov.f32 s4, s0 -; CHECK-NEXT: vmov.f32 s5, s2 -; CHECK-NEXT: vmov.f32 s20, s9 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vadd.f32 q1, q5, q1 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vsub.f32 q2, q2, q0 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcadd.f32 q4, q2, q0, #90 +; CHECK-NEXT: vcadd.f32 q2, q3, q1, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> @@ -102,64 +70,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: add r3, sp, #80 +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: add r2, sp, #80 ; CHECK-NEXT: vldrw.u32 q5, [r3] -; CHECK-NEXT: vldrw.u32 q6, [r2] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov.f32 s4, s0 -; CHECK-NEXT: vmov.f32 s5, s2 -; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vmov.f32 s28, s25 -; CHECK-NEXT: add r1, sp, #112 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s25, s26 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f32 s26, s20 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vsub.f32 q4, q6, q0 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vmov.f32 s31, s23 -; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: vadd.f32 q1, q7, q1 -; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vmov.f32 s17, s10 -; CHECK-NEXT: vmov.f32 s28, s25 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s25, s26 -; 
CHECK-NEXT: vmov.f32 s18, s12 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vmov.f32 s31, s23 -; CHECK-NEXT: vmov.f32 s10, s13 -; CHECK-NEXT: vadd.f32 q4, q7, q4 -; CHECK-NEXT: vmov.f32 s11, s15 -; CHECK-NEXT: vmov.f32 s26, s20 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vsub.f32 q3, q6, q2 -; CHECK-NEXT: vmov.f32 s9, s16 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov.f32 s17, s18 -; CHECK-NEXT: vmov.f32 s16, s14 -; CHECK-NEXT: vmov.f32 s18, s15 -; CHECK-NEXT: vmov.f32 s8, s12 -; CHECK-NEXT: vmov.f32 s10, s13 -; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: add r1, sp, #96 +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vcadd.f32 q4, q5, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vcadd.f32 q5, q0, q1, #90 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vcadd.f32 q6, q0, q2, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vcadd.f32 q7, q0, q3, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q3, q7 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll @@ -32,25 +32,9 @@ define arm_aapcs_vfpcc <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s16, s5 -; CHECK-NEXT: vmov.f32 s13, s2 -; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmul.f32 q2, q4, q3 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vfma.f32 q2, q1, q0 -; CHECK-NEXT: vmul.f32 q0, q0, q4 -; CHECK-NEXT: vneg.f32 q4, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vfma.f32 q4, q1, q3 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -71,37 +55,15 @@ define arm_aapcs_vfpcc <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov.f32 s20, s0 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vmov.f32 s25, s11 -; CHECK-NEXT: vmov.f32 s22, s16 -; CHECK-NEXT: vmov.f32 s23, s18 -; CHECK-NEXT: vmov.f32 s26, s13 -; CHECK-NEXT: vmov.f32 s27, s15 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmul.f32 q1, q6, q5 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vfma.f32 q1, q2, q0 -; CHECK-NEXT: vmul.f32 q0, q0, q6 -; CHECK-NEXT: vneg.f32 q3, q0 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vfma.f32 q3, q2, q5 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s6, s15 -; 
CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f32 q4, q0, q2, #0 +; CHECK-NEXT: vcmla.f32 q4, q0, q2, #90 +; CHECK-NEXT: vcmul.f32 q2, q1, q3, #0 +; CHECK-NEXT: vcmla.f32 q2, q1, q3, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> @@ -124,75 +86,26 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r2, sp, #96 -; CHECK-NEXT: add r3, sp, #112 +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: add r2, sp, #80 ; CHECK-NEXT: vldrw.u32 q5, [r3] -; CHECK-NEXT: vldrw.u32 q4, [r2] -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s24, s1 -; CHECK-NEXT: vmov.f32 s12, s17 -; CHECK-NEXT: add r0, sp, #144 -; CHECK-NEXT: vmov.f32 s13, s19 -; CHECK-NEXT: add r1, sp, #128 -; CHECK-NEXT: vmov.f32 s25, s3 -; CHECK-NEXT: vmov.f32 s14, s21 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vmov.f32 s26, s5 -; CHECK-NEXT: vmov.f32 s27, s7 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmul.f32 q7, q6, q3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmul.f32 q1, q3, q0 -; CHECK-NEXT: vmov.f32 s17, s18 -; CHECK-NEXT: vmov.f32 s18, s20 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vmov.f32 s19, s22 -; CHECK-NEXT: vneg.f32 q5, q7 -; CHECK-NEXT: vfma.f32 q5, q4, q0 -; CHECK-NEXT: vfma.f32 q1, q4, q6 -; CHECK-NEXT: vmov.f32 s0, s20 -; CHECK-NEXT: vldrw.u32 q6, [r1] -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s21 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s28, s25 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vmov.f32 s30, s13 -; CHECK-NEXT: vmov.f32 s31, s15 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vmov.f32 s11, s3 -; CHECK-NEXT: vmov.f32 s4, s22 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s23 -; CHECK-NEXT: vmov.f32 s22, s0 -; CHECK-NEXT: vmov.f32 s23, s2 -; CHECK-NEXT: vmul.f32 q0, q2, q7 -; CHECK-NEXT: vmov.f32 s25, s26 -; CHECK-NEXT: vmul.f32 q4, q7, q5 -; CHECK-NEXT: vmov.f32 s26, s12 -; CHECK-NEXT: vneg.f32 q0, q0 -; CHECK-NEXT: vmov.f32 s27, s14 -; CHECK-NEXT: vfma.f32 q4, q6, q2 -; CHECK-NEXT: vfma.f32 q0, q6, q5 -; CHECK-NEXT: vmov.f32 s9, s16 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vmov.f32 s17, s18 -; CHECK-NEXT: vmov.f32 s16, s2 -; CHECK-NEXT: vmov.f32 s18, s3 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov.f32 s10, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add r1, sp, #96 +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vcmul.f32 q4, q0, q5, #0 +; CHECK-NEXT: vcmla.f32 q4, q0, q5, #90 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vcmul.f32 q5, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q5, q1, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vcmul.f32 q6, q2, q0, #0 +; CHECK-NEXT: vcmla.f32 q6, q2, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vcmul.f32 q7, q3, q0, #0 +; CHECK-NEXT: vcmla.f32 q7, q3, q0, #90 +; CHECK-NEXT: vmov q0, q4 +; 
CHECK-NEXT: vmov q3, q7 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + +; Expected to not transform +define arm_aapcs_vfpcc <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d3, d3, d0 +; CHECK-NEXT: vsub.f64 d2, d2, d1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d5, d5, d0 +; CHECK-NEXT: vsub.f64 d4, d4, d1 +; CHECK-NEXT: vadd.f64 d7, d7, d2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vsub.f64 d6, d6, d3 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vsub.f64 d0, d0, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vadd.f64 d3, d3, d8 +; CHECK-NEXT: vsub.f64 d2, d2, d9 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vadd.f64 d9, d9, d4 +; CHECK-NEXT: vsub.f64 d8, d8, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.f64 d11, d5, d6 +; CHECK-NEXT: vsub.f64 d10, d4, d7 +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x 
double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +; Expected to not transform +define arm_aapcs_vfpcc <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vfma.f64 d5, d2, d1 +; CHECK-NEXT: vfnms.f64 d4, d2, d0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmul.f64 d9, d7, d2 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmul.f64 d8, d3, d7 +; CHECK-NEXT: vfma.f64 d9, d6, d3 +; CHECK-NEXT: vfnms.f64 d8, d6, d2 +; CHECK-NEXT: vmul.f64 d1, d5, d10 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmul.f64 d0, d11, d5 +; CHECK-NEXT: vfma.f64 d1, d4, d11 +; CHECK-NEXT: vfnms.f64 d0, d4, d10 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, %b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #64 +; 
CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: add r0, sp, #128 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #160 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: add r0, sp, #176 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d11, d3, d0 +; CHECK-NEXT: vmul.f64 d10, d1, d3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f64 d7, d9, d12 +; CHECK-NEXT: vmul.f64 d2, d15, d1 +; CHECK-NEXT: vmul.f64 d3, d1, d14 +; CHECK-NEXT: vmul.f64 d6, d13, d9 +; CHECK-NEXT: vfma.f64 d7, d8, d13 +; CHECK-NEXT: vfnms.f64 d6, d8, d12 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d3, d0, d15 +; CHECK-NEXT: vfnms.f64 d2, d0, d14 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d5, d0, d9 +; CHECK-NEXT: vfnms.f64 d4, d0, d8 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d11, d0, d9 +; CHECK-NEXT: vfnms.f64 d10, d0, d8 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll @@ -7,32 +7,10 @@ define arm_aapcs_vfpcc <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10} -; CHECK-NEXT: vpush {d8, d9, d10} -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vmov.f32 s20, s5 -; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vmov.f32 s21, s7 -; CHECK-NEXT: vmul.f32 q3, q5, q4 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vneg.f32 q3, q3 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vmul.f32 q0, q5, q0 -; CHECK-NEXT: vfma.f32 q0, q1, q4 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s17, s11 -; CHECK-NEXT: vmul.f32 q1, q3, q4 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vfma.f32 q1, q0, q2 -; CHECK-NEXT: vmul.f32 q0, q0, q4 -; CHECK-NEXT: vneg.f32 q4, q0 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vfma.f32 q4, q2, q3 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: vmov.f32 s2, 
s17 -; CHECK-NEXT: vpop {d8, d9, d10} +; CHECK-NEXT: vcmul.f32 q3, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q3, q0, q1, #90 +; CHECK-NEXT: vcmul.f32 q0, q3, q2, #0 +; CHECK-NEXT: vcmla.f32 q0, q3, q2, #90 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -167,30 +145,11 @@ define arm_aapcs_vfpcc <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: mul_triangle: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmul.f32 q4, q3, q2 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vfma.f32 q4, q1, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q3 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q2, q0 -; CHECK-NEXT: vmul.f32 q2, q4, q0 -; CHECK-NEXT: vfma.f32 q2, q1, q3 -; CHECK-NEXT: vmul.f32 q3, q4, q3 -; CHECK-NEXT: vneg.f32 q3, q3 -; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vcmul.f32 q2, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q2, q1, q0, #90 +; CHECK-NEXT: vcmul.f32 q1, q0, q2, #0 +; CHECK-NEXT: vcmla.f32 q1, q0, q2, #90 +; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -219,48 +178,17 @@ define arm_aapcs_vfpcc <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) { ; CHECK-LABEL: mul_diamond: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vmov.f32 s16, s1 -; CHECK-NEXT: vmov.f32 s28, s5 -; CHECK-NEXT: vmov.f32 s17, s3 -; CHECK-NEXT: vmov.f32 s29, s7 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmul.f32 q5, q7, q4 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmul.f32 q6, q4, q1 -; CHECK-NEXT: vneg.f32 q5, q5 -; CHECK-NEXT: vfma.f32 q6, q0, q7 -; CHECK-NEXT: vmov.f32 s28, s13 -; CHECK-NEXT: vfma.f32 q5, q0, q1 -; CHECK-NEXT: vmov.f32 s29, s15 -; CHECK-NEXT: vmul.f32 q1, q6, q7 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q3, q5 -; CHECK-NEXT: vmul.f32 q5, q5, q7 -; CHECK-NEXT: vfma.f32 q5, q3, q6 -; CHECK-NEXT: vmov.f32 s24, s9 -; CHECK-NEXT: vmov.f32 s25, s11 -; CHECK-NEXT: vmul.f32 q3, q6, q4 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vneg.f32 q7, q3 -; CHECK-NEXT: vfma.f32 q7, q2, q0 -; CHECK-NEXT: vmul.f32 q2, q2, q4 -; CHECK-NEXT: vfma.f32 q2, q6, q0 -; CHECK-NEXT: vmul.f32 q3, q7, q5 -; CHECK-NEXT: vmul.f32 q0, q2, q5 -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vneg.f32 q2, q0 -; CHECK-NEXT: vmov.f32 s1, s12 -; CHECK-NEXT: vfma.f32 q2, q7, q1 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: vpop {d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f32 q4, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q4, q1, q0, #90 +; CHECK-NEXT: vcmul.f32 q1, q4, q3, #0 +; CHECK-NEXT: vcmla.f32 q1, q4, q3, #90 +; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0 +; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90 +; CHECK-NEXT: vcmul.f32 q0, q3, q1, #0 +; CHECK-NEXT: vcmla.f32 q0, q3, q1, #90 +; CHECK-NEXT: 
vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -303,37 +231,14 @@ define arm_aapcs_vfpcc <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: mul_add90_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s16, s4 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s17, s6 -; CHECK-NEXT: vmov.f32 s24, s5 -; CHECK-NEXT: vmul.f32 q5, q4, q3 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s25, s7 -; CHECK-NEXT: vfma.f32 q5, q6, q0 -; CHECK-NEXT: vmul.f32 q6, q6, q3 -; CHECK-NEXT: vmov.f32 s4, s9 -; CHECK-NEXT: vneg.f32 q6, q6 -; CHECK-NEXT: vmov.f32 s5, s11 -; CHECK-NEXT: vfma.f32 q6, q4, q0 -; CHECK-NEXT: vmul.f32 q7, q1, q3 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vneg.f32 q7, q7 -; CHECK-NEXT: vfma.f32 q6, q2, q3 -; CHECK-NEXT: vfma.f32 q7, q2, q0 -; CHECK-NEXT: vfma.f32 q6, q1, q0 -; CHECK-NEXT: vsub.f32 q5, q7, q5 -; CHECK-NEXT: vmov.f32 s1, s24 -; CHECK-NEXT: vmov.f32 s0, s20 -; CHECK-NEXT: vmov.f32 s2, s21 -; CHECK-NEXT: vmov.f32 s3, s25 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: vpop {d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0 +; CHECK-NEXT: vcmul.f32 q4, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q4, q1, q0, #90 +; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90 +; CHECK-NEXT: vcadd.f32 q0, q3, q4, #90 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll --- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll @@ -7,25 +7,9 @@ define arm_aapcs_vfpcc <4 x float> @simple_mul(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s16, s5 -; CHECK-NEXT: vmov.f32 s13, s2 -; CHECK-NEXT: vmov.f32 s17, s7 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmul.f32 q2, q4, q3 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vfma.f32 q2, q1, q0 -; CHECK-NEXT: vmul.f32 q0, q0, q4 -; CHECK-NEXT: vneg.f32 q4, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vfma.f32 q4, q1, q3 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -84,37 +68,10 @@ define arm_aapcs_vfpcc <4 x float> @three_way_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: three_way_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d10, d11} -; CHECK-NEXT: vpush {d10, d11} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: vmov.f32 s16, s4 -; CHECK-NEXT: vmov.f32 s20, s1 -; CHECK-NEXT: vmov.f32 s17, s6 -; CHECK-NEXT: vmov.f32 s21, s3 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmul.f32 q3, q5, q4 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s5, 
s7 -; CHECK-NEXT: vfma.f32 q3, q1, q0 -; CHECK-NEXT: vmul.f32 q1, q1, q5 -; CHECK-NEXT: vneg.f32 q5, q1 -; CHECK-NEXT: vfma.f32 q5, q4, q0 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s1, s10 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vmul.f32 q1, q3, q0 -; CHECK-NEXT: vmov.f32 s9, s11 -; CHECK-NEXT: vfma.f32 q1, q5, q2 -; CHECK-NEXT: vmul.f32 q2, q3, q2 -; CHECK-NEXT: vneg.f32 q2, q2 -; CHECK-NEXT: vfma.f32 q2, q5, q0 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: vpop {d10, d11} +; CHECK-NEXT: vcmul.f32 q3, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q3, q1, q0, #90 +; CHECK-NEXT: vcmul.f32 q0, q2, q3, #0 +; CHECK-NEXT: vcmla.f32 q0, q2, q3, #90 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -143,19 +100,8 @@ define arm_aapcs_vfpcc <4 x float> @simple_add_90(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_add_90: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vadd.f32 q2, q3, q2 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vsub.f32 q1, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s0, s4 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -201,19 +147,8 @@ define arm_aapcs_vfpcc <4 x float> @simple_add_270_true(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: simple_add_270_true: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s12, s1 -; CHECK-NEXT: vmov.f32 s4, s5 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s5, s7 -; CHECK-NEXT: vsub.f32 q2, q3, q2 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vadd.f32 q1, q0, q1 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s0, s4 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vcadd.f32 q2, q0, q1, #270 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> @@ -260,26 +195,9 @@ define arm_aapcs_vfpcc <4 x float> @mul_mul_with_fneg(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: mul_mul_with_fneg: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.f32 s8, s5 -; CHECK-NEXT: vmov.f32 s12, s0 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s13, s2 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmul.f32 q4, q3, q2 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vfma.f32 q4, q1, q0 -; CHECK-NEXT: vmul.f32 q1, q3, q1 -; CHECK-NEXT: vneg.f32 q1, q1 -; CHECK-NEXT: vfma.f32 q1, q2, q0 -; CHECK-NEXT: vneg.f32 q2, q4 -; CHECK-NEXT: vmov.f32 s0, s4 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s9 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vcmul.f32 q2, q1, q0, #270 +; CHECK-NEXT: vcmla.f32 q2, q1, q0, #180 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32>