diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -0,0 +1,45 @@
+//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic and deinterleaving.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct ComplexDeinterleavingPass
+    : public PassInfoMixin<ComplexDeinterleavingPass> {
+private:
+  TargetMachine *TM;
+
+public:
+  ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+enum class ComplexDeinterleavingOperation {
+  None,
+  CAdd,
+  CMulPartial,
+  // The following operations are used to represent internal states. Backends
+  // are not expected to try to support these in any capacity.
+  _Incomplete = 100,
+  _Placeholder = 101
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -79,6 +79,10 @@
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass();
 
+  /// This pass implements generation of target-specific intrinsics to support
+  /// handling of complex number arithmetic.
+  FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM);
+
   /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
   /// load-linked/store-conditional loops.
   extern char &AtomicExpandID;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -28,6 +28,7 @@
 #include "llvm/ADT/STLArrayExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/LowLevelType.h"
@@ -3063,6 +3064,27 @@
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Does this target support complex deinterleaving?
+  virtual bool isComplexDeinterleavingSupported() const { return false; }
+
+  /// Does this target support complex deinterleaving with the given operation
+  /// and type?
+  virtual bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const {
+    return false;
+  }
+
+  /// Create the IR node for the given complex deinterleaving operation.
+  /// If one cannot be created using all the given inputs, nullptr should be
+  /// returned.
+  virtual Value *
+  createComplexDeinterleavingIR(Instruction *I,
+                                ComplexDeinterleavingOperation OperationType,
+                                unsigned Rotation, Value *InputA, Value *InputB,
+                                Value *Accumulator = nullptr) const {
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -108,6 +108,7 @@
 void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
 void initializeCheckDebugMachineModulePass(PassRegistry &);
 void initializeCodeGenPreparePass(PassRegistry&);
+void initializeComplexDeinterleavingLegacyPassPass(PassRegistry&);
 void initializeConstantHoistingLegacyPassPass(PassRegistry&);
 void initializeConstantMergeLegacyPassPass(PassRegistry&);
 void initializeConstraintEliminationPass(PassRegistry &);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -43,6 +43,7 @@
   CodeGenPassBuilder.cpp
   CodeGenPrepare.cpp
   CommandFlags.cpp
+  ComplexDeinterleavingPass.cpp
   CriticalAntiDepBreaker.cpp
   DeadMachineInstructionElim.cpp
   DetectDeadLanes.cpp
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -0,0 +1,868 @@
+//===- ComplexDeinterleavingPass.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Identification:
+// This step is responsible for finding the patterns that can be lowered to
+// complex instructions. Starting from the "Converging Shuffle" (that is, a
+// shuffle that reinterleaves the complex components, with a mask of
+// <0, 2, 1, 3>), the operands are evaluated and identified as "Composite
+// Nodes" (collections of instructions that can potentially be lowered to a
+// single complex instruction). This is performed by checking the real and
+// imaginary components in parallel, and tracking the data flow for each
+// component while following the operand pairs.
+//
+// Replacement:
+// This step performs the necessary input wrangling (chasing values through
+// accumulators, shuffles, and other composite nodes) in order for the target
+// to know what to generate. While some additional checks are performed at
+// this step, it is expected to finish successfully; any errors should be
+// caught via asserts.
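+//
+// As an example, a 90-degree complex addition of two <4 x float> values
+// (compare complex_add_v4f32 in the accompanying tests) is identified from
+// IR of the following shape:
+//
+//   %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+//   %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+//   %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+//   %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+//   %sub = fsub <2 x float> %a.real, %b.imag
+//   %add = fadd <2 x float> %a.imag, %b.real
+//   %res = shufflevector <2 x float> %sub, <2 x float> %add, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+//
+// which the ARM backend can then lower to a single VCADD.F32 with a rotation
+// of 90 degrees.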
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <memory>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "complex-deinterleaving"
+
+STATISTIC(NumComplexTransformations, "Amount of complex patterns transformed");
+
+static cl::opt<bool> ComplexArithmeticEnabled(
+    "enable-complex-arithmetic",
+    cl::desc("Enable generation of complex arithmetic instructions"),
+    cl::init(true), cl::Hidden);
+
+/// Checks the given mask, and determines whether said mask is interleaving.
+///
+/// To be interleaving, a mask must alternate between `i` and `i + (Length /
+/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a
+/// 4x vector interleaving mask would be <0, 2, 1, 3>).
+static bool isInterleavingMask(ArrayRef<int> Mask);
+
+/// Checks the given mask, and determines whether said mask is deinterleaving.
+///
+/// To be deinterleaving, a mask must increment in steps of 2, and either start
+/// with 0 or 1.
+/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or
+/// <1, 3, 5, 7>).
+static bool isDeinterleavingMask(ArrayRef<int> Mask);
+
+namespace {
+
+class ComplexDeinterleavingLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {
+    initializeComplexDeinterleavingLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Complex Arithmetic Pass"; }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  const TargetMachine *TM;
+};
+
+enum OperatingComponent { Real, Imaginary, Unknown };
+
+class ComplexDeinterleavingGraph;
+struct ComplexDeinterleavingCompositeNode {
+
+  ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op)
+      : Operation(Op) {}
+
+private:
+  friend class ComplexDeinterleavingGraph;
+  using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
+  using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+
+public:
+  SmallVector<Instruction *> ContainedInstructions;
+  Value *OutputNode = nullptr;
+  Value *ReplacementNode = nullptr;
+  ComplexDeinterleavingOperation Operation;
+
+  unsigned Rotation = 0;
+  NodePtr AccumulatorNode = nullptr;
+  SmallVector<NodePtr> Operands;
+
+  struct NodeUse {
+    RawNodePtr User;
+    unsigned Slot;
+  };
+  SmallVector<NodeUse> Uses;
+
+  void addInstruction(Instruction *I) { ContainedInstructions.push_back(I); }
+  void addOperand(NodePtr Node) {
+    Node->addUse(this, Operands.size());
+    Operands.push_back(Node);
+  }
+
+  void addUse(RawNodePtr Node, unsigned Slot) { Uses.push_back({Node, Slot}); }
+
+  void replaceUsesWith(NodePtr NewUse) {
+    for (auto &Use : Uses)
+      Use.User->Operands[Use.Slot] = NewUse;
+    Uses.clear();
+  }
+
+  bool hasUses() { return !Uses.empty(); }
+
+  bool contains(Value *V) {
+    if (V == ReplacementNode)
+      return true;
+
+    return llvm::find(ContainedInstructions, V) != ContainedInstructions.end();
+  }
+
+  void dump() { dump(dbgs()); }
+  void dump(raw_ostream &OS) {
+    auto PrintValue = [&](Value *V) {
+      if (V) {
+        OS << "\"";
+        V->print(OS, true);
+        OS << "\"\n";
+      } else
+        OS << "nullptr\n";
+    };
+    auto PrintNodeRef = [&](NodePtr Ptr) {
+      if (Ptr.get())
+        OS << Ptr.get() << "\n";
+      else
+        OS << "nullptr\n";
+    };
+
+    OS << "- CompositeNode: " << this << "\n";
+    OS << "  OutputNode:      ";
+    PrintValue(OutputNode);
+    OS << "  ReplacementNode: ";
+    PrintValue(ReplacementNode);
+    OS << "  AccumulatorNode: ";
+    PrintNodeRef(AccumulatorNode);
+    OS << "  Operation:       " << (int)Operation << "\n";
+    OS << "  Rotation:        " << Rotation << "\n";
+    OS << "  Operands:\n";
+    for (const auto &Op : Operands) {
+      OS << "    - ";
+      PrintNodeRef(Op);
+    }
+    OS << "  ContainedInstructions:\n";
+    for (const auto &I : ContainedInstructions) {
+      OS << "    - \"";
+      I->print(OS, true);
+      OS << "\"\n";
+    }
+  }
+};
+
+class ComplexDeinterleavingGraph {
+public:
+  using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr;
+  explicit ComplexDeinterleavingGraph(const TargetLowering *tl) : TL(tl) {}
+
+private:
+  const TargetLowering *TL;
+  Instruction *RootValue;
+  NodePtr RootNode;
+  SmallVector<NodePtr> CompositeNodes;
+
+  struct OperandPair {
+    Instruction *Real;
+    Instruction *Imaginary;
+    bool Valid = true;
+  };
+
+  OperandPair getOperandPair(Value *Left, Value *Right) {
+    auto LeftComp = getOperatingComponentOfValue(Left);
+    auto RightComp = getOperatingComponentOfValue(Right);
+
+    OperandPair P;
+
+    if (LeftComp == OperatingComponent::Unknown ||
+        RightComp == OperatingComponent::Unknown) {
+      P.Valid = false;
+      return P;
+    }
+
+    Instruction *LeftI = cast<Instruction>(Left);
+    Instruction *RightI = cast<Instruction>(Right);
+
+    if (LeftComp == OperatingComponent::Real) {
+      P.Real = LeftI;
+      P.Imaginary = RightI;
+    } else {
+      P.Real = RightI;
+      P.Imaginary = LeftI;
+    }
+
+    return P;
+  }
+
+  bool hasIncompleteNodes() {
+    for (const auto &Node : CompositeNodes) {
+      if (Node->Operation != ComplexDeinterleavingOperation::_Incomplete)
+        continue;
+
+      if (Node->hasUses())
+        return true;
+    }
+    return false;
+  }
+
+  void replaceIncompleteNodes() {
+    for (auto &Node : CompositeNodes) {
+      if (Node->Operation != ComplexDeinterleavingOperation::_Incomplete)
+        continue;
+
+      Value *V =
+          Node->ReplacementNode ? Node->ReplacementNode : Node->OutputNode;
+
+      if (isa<Constant>(V) || !isa<Instruction>(V)) {
+        Node->Operation = ComplexDeinterleavingOperation::_Placeholder;
+        continue;
+      }
+
+      if (auto Containing = getContainingComposite(V)) {
+        if (Node != Containing) {
+          Node->replaceUsesWith(Containing);
+          continue;
+        }
+      }
+    }
+  }
+
+  NodePtr wrapValueAsNode(Value *V) {
+    auto *I = dyn_cast<Instruction>(V);
+
+    auto PlaceholderNode =
+        prepareCompositeNode(ComplexDeinterleavingOperation::_Incomplete);
+    if (I)
+      PlaceholderNode->addInstruction(I);
+    PlaceholderNode->OutputNode = V;
+    PlaceholderNode->ReplacementNode = V;
+
+    // If the value is itself a shuffle, refer directly to the shuffle's input
+    // vector instead.
+    if (auto *Shuffle =
+            dyn_cast<ShuffleVectorInst>(PlaceholderNode->ReplacementNode))
+      PlaceholderNode->ReplacementNode = Shuffle->getOperand(0);
+
+    return submitCompositeNode(PlaceholderNode);
+  }
+
+  /// Determines the operating component (real or imaginary) of the given
+  /// Value by looking at the operating components of its operands and, based
+  /// on the instruction, evaluating what the resulting component would be.
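+  ///
+  /// For example, with %r = shufflevector %v, ..., <0, 2, ...> (real) and
+  /// %i = shufflevector %v, ..., <1, 3, ...> (imaginary):
+  ///   fmul %r, %r and fmul %i, %i yield Real (matching components), and
+  ///   fmul %r, %i yields Imaginary (mixed components),
+  /// while fneg, fadd and fsub take the component of their first operand.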
+  OperatingComponent getOperatingComponentOfValue(Value *V) {
+    Instruction *I = dyn_cast_or_null<Instruction>(V);
+    if (!I)
+      return Unknown;
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+      auto ShuffleMask = Shuffle->getShuffleMask();
+      if (isDeinterleavingMask(ShuffleMask))
+        // A mask starting at 0 extracts the real component, a mask starting
+        // at 1 the imaginary one.
+        return (OperatingComponent)ShuffleMask[0];
+      return Unknown;
+    }
+
+    auto OpcI = I->getOpcode();
+
+    if (OpcI == Instruction::FMul) {
+      auto Op0Component = getOperatingComponentOfValue(I->getOperand(0));
+      auto Op1Component = getOperatingComponentOfValue(I->getOperand(1));
+      if (Op0Component == Unknown || Op1Component == Unknown)
+        return Unknown;
+      if (Op0Component == Op1Component)
+        return Real;
+      return Imaginary;
+    }
+
+    if (OpcI == Instruction::FNeg)
+      return getOperatingComponentOfValue(I->getOperand(0));
+
+    if (OpcI == Instruction::FAdd || OpcI == Instruction::FSub)
+      return getOperatingComponentOfValue(I->getOperand(0));
+
+    return Unknown;
+  }
+
+  NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation) {
+    return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation);
+  }
+
+  NodePtr submitCompositeNode(NodePtr Node) {
+    CompositeNodes.push_back(Node);
+    return Node;
+  }
+
+  NodePtr getContainingComposite(Value *V) {
+    if (V == nullptr)
+      return nullptr;
+
+    auto Filter = [&](NodePtr N) -> bool {
+      return N->Operation != ComplexDeinterleavingOperation::_Incomplete &&
+             N->Operation != ComplexDeinterleavingOperation::_Placeholder;
+    };
+
+    for (const auto &CN : CompositeNodes) {
+      if (CN->contains(V) && Filter(CN))
+        return CN;
+      if (CN->ReplacementNode == V && Filter(CN))
+        return CN;
+    }
+    return nullptr;
+  }
+
+  /// Identifies a complex partial multiply pattern and its rotation, based on
+  /// the following patterns:
+  ///
+  ///  0:   r: cr + ar * br
+  ///       i: ci + ar * bi
+  ///  90:  r: cr - ai * bi
+  ///       i: ci + ai * br
+  ///  180: r: cr - ar * br
+  ///       i: ci - ar * bi
+  ///  270: r: cr + ai * bi
+  ///       i: ci - ai * br
+  NodePtr identifyPartialMul(Instruction *RealI, Instruction *ImagI);
+
+  /// Identifies a complex add pattern and its rotation, based on the
+  /// following patterns:
+  ///
+  ///  90:  r: sub = ar, bi
+  ///       i: add = ai, br
+  ///  270: r: add = ar, bi
+  ///       i: sub = ai, br
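+  ///
+  /// The rotation gives the angle (in degrees) by which B is rotated in the
+  /// complex plane before the addition, e.g. a rotation of 90 computes
+  /// A + B*i:
+  ///   (ar + ai*i) + (br + bi*i)*i = (ar - bi) + (ai + br)*i
+  /// which matches the sub/add pair above.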
+  NodePtr identifyAdd(Instruction *RealI, Instruction *ImagI);
+
+  NodePtr identifyNode(Instruction *I, Instruction *J);
+  NodePtr identifyNodeWithImplicitAdd(Instruction *I, Instruction *J,
+                                      Instruction *&CommonOperandI);
+
+  Value *replaceNode(NodePtr Node);
+
+  void dump() { dump(dbgs()); }
+  void dump(raw_ostream &OS) {
+    for (const auto &Node : CompositeNodes)
+      Node->dump(OS);
+  }
+
+public:
+  /// Identify the composite nodes reachable from the given root shuffle.
+  /// Returns false if the deinterleaving operation should be cancelled for
+  /// the current graph.
+  bool identifyNodes(Instruction *RootI);
+
+  /// Perform the actual replacement of the underlying instruction graph.
+  void replaceNodes();
+};
+
+class ComplexDeinterleaving {
+public:
+  ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli)
+      : TL(tl), TLI(tli) {}
+  bool runOnFunction(Function &F);
+
+private:
+  bool evaluateBasicBlock(BasicBlock *B);
+
+  const TargetLowering *TL = nullptr;
+  const TargetLibraryInfo *TLI = nullptr;
+};
+
+} // namespace
+
+char ComplexDeinterleavingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                      "Complex Deinterleaving", false, false)
+INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                    "Complex Deinterleaving", false, false)
+
+PreservedAnalyses ComplexDeinterleavingPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<FunctionAnalysisManagerModuleProxy>();
+  return PA;
+}
+
+FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) {
+  return new ComplexDeinterleavingLegacyPass(TM);
+}
+
+bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) {
+  const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  return ComplexDeinterleaving(TL, &TLI).runOnFunction(F);
+}
+
+bool ComplexDeinterleaving::runOnFunction(Function &F) {
+  if (!ComplexArithmeticEnabled) {
+    LLVM_DEBUG(dbgs() << "Complex deinterleaving has been explicitly "
+                         "disabled.\n");
+    return false;
+  }
+
+  if (!TL->isComplexDeinterleavingSupported()) {
+    LLVM_DEBUG(dbgs() << "Complex deinterleaving has been disabled, target "
+                         "does not support lowering of complex numbers.\n");
+    return false;
+  }
+
+  bool Changed = false;
+  for (auto &B : F)
+    Changed |= evaluateBasicBlock(&B);
+
+  return Changed;
+}
+
+static bool isInterleavingMask(ArrayRef<int> Mask) {
+  if ((Mask.size() & 1) == 1)
+    return false;
+
+  int HalfNumElements = Mask.size() / 2;
+  for (int Idx = 0; Idx < HalfNumElements; ++Idx) {
+    int MaskIdx = Idx * 2;
+    if (Mask[MaskIdx] != Idx || Mask[MaskIdx + 1] != (Idx + HalfNumElements))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isDeinterleavingMask(ArrayRef<int> Mask) {
+  int Offset = Mask[0];
+  int HalfNumElements = Mask.size() / 2;
+
+  for (int Idx = 1; Idx < HalfNumElements; ++Idx) {
+    if (Mask[Idx] != (Idx * 2) + Offset)
+      return false;
+  }
+
+  return true;
+}
+
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
+  bool Changed = false;
+
+  SmallVector<Instruction *> DeadInstrRoots;
+
+  for (auto &I : *B) {
+    if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+      // Look for a shufflevector that takes separate vectors of the real and
+      // imaginary components and recombines them into a single vector.
+      if (isInterleavingMask(SVI->getShuffleMask())) {
+        ComplexDeinterleavingGraph Graph(TL);
+        if (Graph.identifyNodes(SVI)) {
+          Graph.replaceNodes();
+          Changed = true;
+          DeadInstrRoots.push_back(SVI);
+        }
+      }
+    }
+  }
+
+  for (const auto &I : DeadInstrRoots) {
+    if (!I || I->getParent() == nullptr)
+      continue;
+    llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+  }
+
+  return Changed;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd(
+    Instruction *RealI, Instruction *ImagI, Instruction *&CommonOperandI) {
+
+  if (auto CN = getContainingComposite(RealI))
+    return CN;
+  if (auto CN = getContainingComposite(ImagI))
+    return CN;
+
+  if (RealI->getOpcode() != Instruction::FMul ||
+      ImagI->getOpcode() != Instruction::FMul)
+    return nullptr;
+
+  unsigned Rotation = 0;
+
+  Value *R0 = RealI->getOperand(0);
+  Value *R1 = RealI->getOperand(1);
+  Value *I0 = ImagI->getOperand(0);
+  Value *I1 = ImagI->getOperand(1);
+
+  Value *CommonOperand = nullptr;
+  Value *UncommonRealOp = nullptr;
+
+  if (R0 == I0 || R0 == I1) {
+    CommonOperand = R0;
+    UncommonRealOp = R1;
+  } else if (R1 == I0 || R1 == I1) {
+    CommonOperand = R1;
+    UncommonRealOp = R0;
+  }
+
+  if (CommonOperand == nullptr)
+    return nullptr;
+
+  if (auto *I = dyn_cast_or_null<Instruction>(CommonOperand)) {
+    if (I->getOpcode() == Instruction::FNeg)
+      Rotation += 180;
+  }
+
+  auto *UncommonImagOp = CommonOperand == I0 ? I1 : I0;
+
+  auto Node =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CMulPartial);
+  Node->addInstruction(RealI);
+  Node->addInstruction(ImagI);
+  Node->Rotation = Rotation;
+  Node->OutputNode = RealI;
+  Node->addOperand(wrapValueAsNode(CommonOperand));
+
+  auto UncommonNode = identifyNode(cast<Instruction>(UncommonRealOp),
+                                   cast<Instruction>(UncommonImagOp));
+  if (!UncommonNode)
+    UncommonNode = wrapValueAsNode(UncommonRealOp);
+  Node->addOperand(UncommonNode);
+  CommonOperandI = dyn_cast<Instruction>(CommonOperand);
+
+  return submitCompositeNode(Node);
+}
+
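+// A full complex multiplication decomposes into a pair of partial
+// multiplies. With a = ar + ai*i and b = br + bi*i:
+//
+//   a * b = (ar * br - ai * bi) + (ar * bi + ai * br) * i
+//
+// that is, a rotation-0 partial multiply (r += ar * br, i += ar * bi)
+// accumulated with a rotation-90 one (r -= ai * bi, i += ai * br), which is
+// why only the rotation pairs (0, 90) and (180, 270) are accepted below.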
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPartialMul(Instruction *RealI,
+                                               Instruction *ImagI) {
+  // Determine the rotation; 360 acts as a "not assigned" sentinel.
+  unsigned Rotation = 360;
+  if (RealI->getOpcode() == Instruction::FAdd &&
+      ImagI->getOpcode() == Instruction::FAdd)
+    Rotation = 0;
+  else if (RealI->getOpcode() == Instruction::FSub &&
+           ImagI->getOpcode() == Instruction::FAdd)
+    Rotation = 90;
+  else if (RealI->getOpcode() == Instruction::FSub &&
+           ImagI->getOpcode() == Instruction::FSub)
+    Rotation = 180;
+  else if (RealI->getOpcode() == Instruction::FAdd &&
+           ImagI->getOpcode() == Instruction::FSub)
+    Rotation = 270;
+
+  if (Rotation == 360) {
+    LLVM_DEBUG(dbgs() << "Unhandled case, rotation is not assigned.\n");
+    return nullptr;
+  }
+
+  Value *CR = RealI->getOperand(0);
+  Instruction *RealMulI = dyn_cast<Instruction>(RealI->getOperand(1));
+  if (!RealMulI)
+    return nullptr;
+  Value *CI = ImagI->getOperand(0);
+  Instruction *ImagMulI = dyn_cast<Instruction>(ImagI->getOperand(1));
+  if (!ImagMulI)
+    return nullptr;
+
+  if (!RealMulI->hasOneUse() || !ImagMulI->hasOneUse())
+    return nullptr;
+
+  Value *R0 = RealMulI->getOperand(0);
+  Value *R1 = RealMulI->getOperand(1);
+  Value *I0 = ImagMulI->getOperand(0);
+  Value *I1 = ImagMulI->getOperand(1);
+
+  Value *CommonOperand = nullptr;
+  Value *UncommonRealOp = nullptr;
+  Value *UncommonImagOp = nullptr;
+
+  if (R0 == I0 || R0 == I1) {
+    CommonOperand = R0;
+    UncommonRealOp = R1;
+  } else if (R1 == I0 || R1 == I1) {
+    CommonOperand = R1;
+    UncommonRealOp = R0;
+  }
+
+  if (!CommonOperand)
+    return nullptr;
+
+  UncommonImagOp = (CommonOperand == I0) ? I1 : I0;
+
+  auto Node =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CMulPartial);
+  Node->addInstruction(RealI);
+  Node->addInstruction(ImagI);
+  Node->addInstruction(RealMulI);
+  Node->addInstruction(ImagMulI);
+  Node->Rotation = Rotation;
+  Node->OutputNode = RealI;
+
+  Instruction *OtherCommonOperand = nullptr;
+  auto CNode = identifyNodeWithImplicitAdd(
+      cast<Instruction>(CR), cast<Instruction>(CI), OtherCommonOperand);
+
+  if (CNode) {
+    // Check the rotation pairing for a full complex multiplication; only the
+    // pairs (0, 90) and (180, 270) are valid.
+    if ((Node->Rotation == 90 && CNode->Rotation != 0) ||
+        (Node->Rotation == 270 && CNode->Rotation != 180))
+      return nullptr;
+    if ((Node->Rotation == 0 && CNode->Rotation != 90) ||
+        (Node->Rotation == 180 && CNode->Rotation != 270))
+      return nullptr;
+
+    Node->AccumulatorNode = CNode;
+  }
+
+  if (OtherCommonOperand) {
+    auto Pair = getOperandPair(CommonOperand, OtherCommonOperand);
+    if (!Pair.Valid)
+      return nullptr;
+    auto Res = identifyNode(Pair.Real, Pair.Imaginary);
+    if (!Res)
+      return nullptr;
+    Node->addOperand(Res);
+  } else
+    Node->addOperand(wrapValueAsNode(CommonOperand));
+
+  auto Pair = getOperandPair(UncommonRealOp, UncommonImagOp);
+  if (!Pair.Valid)
+    return nullptr;
+
+  auto Res = identifyNode(Pair.Real, Pair.Imaginary);
+  if (!Res)
+    return nullptr;
+  Node->addOperand(Res);
+
+  return submitCompositeNode(Node);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyAdd(Instruction *RealI,
+                                        Instruction *ImagI) {
+  // Determine the rotation; complex additions only come in 90 and 270 degree
+  // variants, so 0 acts as the "not assigned" sentinel here.
+  unsigned Rotation = 0;
+  if (RealI->getOpcode() == Instruction::FSub &&
+      ImagI->getOpcode() == Instruction::FAdd)
+    Rotation = 90;
+  else if (RealI->getOpcode() == Instruction::FAdd &&
+           ImagI->getOpcode() == Instruction::FSub)
+    Rotation = 270;
+
+  if (Rotation == 0) {
+    LLVM_DEBUG(dbgs() << "Unhandled case, rotation is not assigned.\n");
+    return nullptr;
+  }
+
+  auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd);
+  Node->addInstruction(RealI);
+  Node->addInstruction(ImagI);
+
+  Node->Rotation = Rotation;
+  Node->OutputNode = RealI;
+
+  auto *AR = cast<Instruction>(RealI->getOperand(0));
+  auto *BI = cast<Instruction>(RealI->getOperand(1));
+  auto *AI = cast<Instruction>(ImagI->getOperand(0));
+  auto *BR = cast<Instruction>(ImagI->getOperand(1));
+
+  auto Res = identifyNode(AR, AI);
+  if (!Res)
+    return nullptr;
+  Node->addOperand(Res);
+  Res = identifyNode(BR, BI);
+  if (!Res)
+    return nullptr;
+  Node->addOperand(Res);
+
+  return submitCompositeNode(Node);
+}
+
+static bool isInstructionPairAdd(Instruction *A, Instruction *B) {
+  auto OpcA = A->getOpcode();
+  auto OpcB = B->getOpcode();
+  return (OpcA == Instruction::FSub && OpcB == Instruction::FAdd) ||
+         (OpcA == Instruction::FAdd && OpcB == Instruction::FSub);
+}
+
+static bool isInstructionPairMul(Instruction *A, Instruction *B) {
+  auto Pattern =
+      m_BinOp(m_FMul(m_Value(), m_Value()), m_FMul(m_Value(), m_Value()));
+
+  return match(A, Pattern) && match(B, Pattern);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNode(Instruction *RealI,
+                                         Instruction *ImagI) {
+  if (auto CN = getContainingComposite(RealI))
+    return CN;
+  if (auto CN = getContainingComposite(ImagI))
+    return CN;
+
+  auto IsZeroInitializer = [&](Value *V) -> bool { return match(V, m_Zero()); };
+
+  auto *RealShuffle = dyn_cast<ShuffleVectorInst>(RealI);
+  auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(ImagI);
+
+  if ((RealShuffle && !ImagShuffle) || (!RealShuffle && ImagShuffle)) {
+    LLVM_DEBUG(dbgs() << "Only one component is a shuffle.\n");
+    return nullptr;
+  }
+
+  // Implicit check for ImagShuffle too: if RealShuffle is set, so is
+  // ImagShuffle, as the mixed case returned above.
+  if (RealShuffle) {
+    auto *RealOp1 = RealShuffle->getOperand(1);
+    if (!isa<UndefValue>(RealOp1) && !IsZeroInitializer(RealOp1))
+      return nullptr;
+    auto *ImagOp1 = ImagShuffle->getOperand(1);
+    if (!isa<UndefValue>(ImagOp1) && !IsZeroInitializer(ImagOp1))
+      return nullptr;
+
+    auto *RealOp0 = RealShuffle->getOperand(0);
+    auto *ImagOp0 = ImagShuffle->getOperand(0);
+
+    if (RealOp0 != ImagOp0)
+      return nullptr;
+
+    auto RealMask = RealShuffle->getShuffleMask();
+    auto ImagMask = ImagShuffle->getShuffleMask();
+    if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask))
+      return nullptr;
+
+    if (RealMask[0] != 0 || ImagMask[0] != 1)
+      return nullptr;
+
+    // Type checking; the shuffle type should be a vector type of the same
+    // scalar type, but half the size.
+    auto CheckType = [&](ShuffleVectorInst *Shuffle) {
+      auto *Op = Shuffle->getOperand(0);
+      auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType());
+      auto *OpTy = cast<FixedVectorType>(Op->getType());
+
+      if (OpTy->getScalarType() != ShuffleTy->getScalarType())
+        return false;
+      if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements())
+        return false;
+
+      return true;
+    };
+
+    if (!CheckType(RealShuffle))
+      return nullptr;
+    if (!CheckType(ImagShuffle))
+      return nullptr;
+    if (RealShuffle->getType() != ImagShuffle->getType())
+      return nullptr;
+
+    if (auto N = getContainingComposite(RealShuffle))
+      return N;
+
+    auto PlaceholderNode =
+        prepareCompositeNode(ComplexDeinterleavingOperation::_Placeholder);
+    PlaceholderNode->addInstruction(RealShuffle);
+    PlaceholderNode->addInstruction(ImagShuffle);
+    PlaceholderNode->OutputNode = RealShuffle;
+    PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0);
+    return submitCompositeNode(PlaceholderNode);
+  }
+
+  auto *VTy = cast<FixedVectorType>(RealI->getType());
+  auto *NewVTy =
+      FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);
+
+  if (TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CMulPartial, NewVTy) &&
+      isInstructionPairMul(RealI, ImagI))
+    return identifyPartialMul(RealI, ImagI);
+
+  if (TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CAdd, NewVTy) &&
+      isInstructionPairAdd(RealI, ImagI))
+    return identifyAdd(RealI, ImagI);
+
+  return nullptr;
+}
+
+bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
+  Instruction *RealI;
+  Instruction *ImagI;
+
+  if (!match(RootI, m_Shuffle(m_Instruction(RealI), m_Instruction(ImagI))))
+    return false;
+
+  RootValue = RootI;
+  RootNode = identifyNode(RealI, ImagI);
+
+  replaceIncompleteNodes();
+  if (hasIncompleteNodes())
+    return false;
+  return RootNode != nullptr;
+}
+
+Value *ComplexDeinterleavingGraph::replaceNode(
+    ComplexDeinterleavingGraph::NodePtr Node) {
+  if (!Node)
+    return nullptr;
+
+  if (Node->ReplacementNode)
+    return Node->ReplacementNode;
+
+  auto *Input0 = replaceNode(Node->Operands[0]);
+  auto *Input1 = replaceNode(Node->Operands[1]);
+  auto *Accumulator = replaceNode(Node->AccumulatorNode);
+
+  assert(Input0->getType() == Input1->getType() &&
+         "Node inputs need to be of the same type");
+
+  auto *AnchorI = cast<Instruction>(Node->OutputNode);
+
+  Node->ReplacementNode = TL->createComplexDeinterleavingIR(
+      AnchorI, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
+
+  assert(Node->ReplacementNode && "Target failed to create Intrinsic call.");
+
+  cast<Instruction>(Node->ReplacementNode)->moveAfter(AnchorI);
+  NumComplexTransformations += 1;
+
+  return Node->ReplacementNode;
+}
+
+void ComplexDeinterleavingGraph::replaceNodes() {
+  auto *R = replaceNode(RootNode);
+  assert(R && "Unable to find replacement for RootValue");
+  RootValue->replaceAllUsesWith(R);
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -734,6 +734,15 @@
     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
+    bool isComplexDeinterleavingSupported() const override;
+    bool isComplexDeinterleavingOperationSupported(
+        ComplexDeinterleavingOperation Operation, Type *Ty) const override;
+
+    Value *createComplexDeinterleavingIR(
+        Instruction *I, ComplexDeinterleavingOperation OperationType,
+        unsigned Rotation, Value *InputA, Value *InputB,
+        Value *Accumulator = nullptr) const override;
+
   protected:
     std::pair<const TargetRegisterInfo *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21768,3 +21768,113 @@
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
+  return Subtarget->hasMVEFloatOps();
+}
+
+bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
+    ComplexDeinterleavingOperation Operation, Type *Ty) const {
+  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VTy)
+    return false;
+
+  auto *ScalarTy = VTy->getScalarType();
+  unsigned NumElements = VTy->getNumElements();
+
+  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+  if (VTyWidth < 128)
+    return false;
+
+  // 32 is the length of the split masks built in
+  // createComplexDeinterleavingIR.
+  if (NumElements > 32)
+    return false;
+
+  // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32.
+  if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
+    return true;
+
+  return false;
+}
+
+Value *ARMTargetLowering::createComplexDeinterleavingIR(
+    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    unsigned Rotation, Value *InputA, Value *InputB, Value *Accumulator) const {
+
+  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+
+  IRBuilder<> B(I);
+
+  if (Accumulator == nullptr)
+    Accumulator = ConstantFP::get(Ty, 0);
+
+  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+
+  assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
+
+  if (TyWidth > 128) {
+    // Split into two halves and recurse, e.g. a 256-bit <16 x half> operation
+    // becomes two <8 x half> operations that are rejoined afterwards.
+    int Stride = Ty->getNumElements() / 2;
+    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
+    auto SplitSeqVec = llvm::to_vector(SplitSeq);
+    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
+    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
+
+    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
+    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
+    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
+    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+    auto *LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
+    auto *UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+
+    auto *LowerSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+    auto *UpperSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
+    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+  }
+
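+  // The MVE intrinsics expect the rotation as a small immediate operand:
+  // 0, 1, 2 and 3 encode rotations of 0, 90, 180 and 270 degrees
+  // respectively.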
+  auto *IntTy = Type::getInt32Ty(B.getContext());
+
+  ConstantInt *ConstRotation = nullptr;
+  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+    if (Rotation == 0)
+      ConstRotation = ConstantInt::get(IntTy, 0);
+    else if (Rotation == 90)
+      ConstRotation = ConstantInt::get(IntTy, 1);
+    else if (Rotation == 180)
+      ConstRotation = ConstantInt::get(IntTy, 2);
+    else if (Rotation == 270)
+      ConstRotation = ConstantInt::get(IntTy, 3);
+
+    if (!ConstRotation)
+      return nullptr; // Invalid rotation for arm_mve_vcmlaq/vcmulq.
+
+    if (Accumulator)
+      return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
+                               {ConstRotation, Accumulator, InputB, InputA});
+    return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
+                             {ConstRotation, InputB, InputA});
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    // 1 means the value is not halved.
+    auto *ConstHalving = ConstantInt::get(IntTy, 1);
+
+    if (Rotation == 90)
+      ConstRotation = ConstantInt::get(IntTy, 0);
+    else if (Rotation == 270)
+      ConstRotation = ConstantInt::get(IntTy, 1);
+
+    if (!ConstRotation)
+      return nullptr; // Invalid rotation for arm_mve_vcaddq.
+
+    return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
+                             {ConstHalving, ConstRotation, InputA, InputB});
+  }
+
+  return nullptr;
+}
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -426,11 +426,15 @@
   TargetPassConfig::addIRPasses();
 
   // Run the parallel DSP pass.
-  if (getOptLevel() == CodeGenOpt::Aggressive)
+  if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createARMParallelDSPPass());
 
+  // Match complex arithmetic patterns.
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(createComplexDeinterleavingPass(TM));
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
-  if (TM->getOptLevel() != CodeGenOpt::None)
+  if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());
 
   // Add Control Flow Guard checks.
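For a target other than ARM, the opt-in surface is just the three TargetLowering hooks introduced above plus scheduling the pass from the backend's addIRPasses(), as done for ARM here. A rough sketch of what that could look like (the FooTargetLowering class, the subtarget query and the foo_vcadd intrinsic are illustrative placeholders, not part of this patch):

  bool FooTargetLowering::isComplexDeinterleavingSupported() const {
    return Subtarget->hasComplexArith(); // hypothetical feature gate
  }

  bool FooTargetLowering::isComplexDeinterleavingOperationSupported(
      ComplexDeinterleavingOperation Operation, Type *Ty) const {
    // Only CAdd and CMulPartial ever reach the target; the _Incomplete and
    // _Placeholder states are internal to the pass.
    auto *VTy = dyn_cast<FixedVectorType>(Ty);
    return VTy && VTy->getScalarType()->isFloatTy() &&
           VTy->getPrimitiveSizeInBits() == 128;
  }

  Value *FooTargetLowering::createComplexDeinterleavingIR(
      Instruction *I, ComplexDeinterleavingOperation OperationType,
      unsigned Rotation, Value *InputA, Value *InputB,
      Value *Accumulator) const {
    IRBuilder<> B(I);
    if (OperationType == ComplexDeinterleavingOperation::CAdd)
      // foo_vcadd is a stand-in for the target's own complex-add intrinsic.
      return B.CreateIntrinsic(Intrinsic::foo_vcadd, InputA->getType(),
                               {B.getInt32(Rotation), InputA, InputB});
    // Returning nullptr signals that no replacement could be built.
    return nullptr;
  }

The pass itself is then added in the backend's pipeline, mirroring the ARM change above: addPass(createComplexDeinterleavingPass(TM));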
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare diff --git a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-add.ll --- a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-add.ll +++ b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-add.ll @@ -55,39 +55,7 @@ define arm_aapcs_vfpcc <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_add_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vins.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s13, s6 -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vins.f16 s13, s10 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vins.f16 s8, s1 -; CHECK-NEXT: vins.f16 s0, s10 -; CHECK-NEXT: vmovx.f16 s10, s3 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s9, s3 -; CHECK-NEXT: vins.f16 s1, s10 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vadd.f16 q2, q3, q2 -; CHECK-NEXT: vsub.f16 q1, q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s6, s4 -; CHECK-NEXT: vins.f16 s4, s8 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vins.f16 s5, s9 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov.f32 s1, s6 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> @@ -102,72 +70,8 @@ define arm_aapcs_vfpcc <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_add_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vmov.f32 s16, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.f32 s17, s2 -; CHECK-NEXT: vins.f16 s16, s1 -; CHECK-NEXT: vmovx.f16 s1, s1 -; CHECK-NEXT: vins.f16 s17, s3 -; CHECK-NEXT: vmovx.f16 s20, s8 -; CHECK-NEXT: vmovx.f16 s18, s9 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vmovx.f16 s3, s3 -; CHECK-NEXT: vins.f16 s20, s18 -; CHECK-NEXT: vmovx.f16 s21, s10 -; CHECK-NEXT: vmovx.f16 s18, s11 -; CHECK-NEXT: vmovx.f16 s22, s12 -; CHECK-NEXT: vmovx.f16 s24, s13 -; CHECK-NEXT: vins.f16 s1, s3 -; CHECK-NEXT: vmovx.f16 s3, s5 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmov.f32 s19, s6 -; CHECK-NEXT: vins.f16 s21, s18 -; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vins.f16 s22, s24 -; CHECK-NEXT: vmovx.f16 s23, s14 -; CHECK-NEXT: vmovx.f16 s24, s15 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmovx.f16 s3, s6 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vins.f16 s12, s13 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vins.f16 s18, s5 -; CHECK-NEXT: vins.f16 s19, s7 -; CHECK-NEXT: vins.f16 s23, s24 -; CHECK-NEXT: vins.f16 s3, 
s4 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vadd.f16 q4, q5, q4 -; CHECK-NEXT: vsub.f16 q2, q2, q0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s5, s10 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s12, s9 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vmovx.f16 s7, s11 -; CHECK-NEXT: vins.f16 s11, s19 -; CHECK-NEXT: vins.f16 s12, s0 -; CHECK-NEXT: vmovx.f16 s0, s19 -; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vins.f16 s10, s18 -; CHECK-NEXT: vins.f16 s9, s17 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vmov.f32 s3, s12 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: vcadd.f16 q0, q2, q0, #90 +; CHECK-NEXT: vcadd.f16 q1, q3, q1, #90 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> @@ -182,144 +86,21 @@ define arm_aapcs_vfpcc <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_add_v32f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vcadd.f16 q0, q4, q0, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vcadd.f16 q1, q4, q1, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] ; CHECK-NEXT: add r0, sp, #64 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vmovx.f16 s18, s1 -; CHECK-NEXT: add r0, sp, #80 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s17, s2 -; CHECK-NEXT: vmovx.f16 s18, s3 -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vmov.f32 s20, s24 -; CHECK-NEXT: vins.f16 s17, s18 -; CHECK-NEXT: vmovx.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s22, s5 -; CHECK-NEXT: vmovx.f16 s19, s6 -; CHECK-NEXT: vmovx.f16 s23, s7 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s1, s25 -; CHECK-NEXT: vmovx.f16 s24, s24 -; CHECK-NEXT: vmov.f32 s21, s26 -; CHECK-NEXT: vins.f16 s20, s25 -; CHECK-NEXT: vins.f16 s18, s22 -; CHECK-NEXT: vmov.f32 s22, s28 -; CHECK-NEXT: vins.f16 s19, s23 -; CHECK-NEXT: vmov.f32 s23, s30 -; CHECK-NEXT: vins.f16 s24, s1 -; CHECK-NEXT: vmovx.f16 s25, s26 -; CHECK-NEXT: vmovx.f16 s1, s27 -; CHECK-NEXT: vins.f16 s21, s27 -; CHECK-NEXT: vins.f16 s25, s1 -; CHECK-NEXT: vmovx.f16 s26, s28 -; CHECK-NEXT: vmovx.f16 s1, s29 -; CHECK-NEXT: vins.f16 s22, s29 -; CHECK-NEXT: vins.f16 s23, s31 -; CHECK-NEXT: add r0, sp, #112 -; CHECK-NEXT: vins.f16 s26, s1 -; CHECK-NEXT: vmovx.f16 s1, s31 -; CHECK-NEXT: vmovx.f16 s27, s30 -; CHECK-NEXT: vsub.f16 q4, q5, q4 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vldrw.u32 q5, [r0] -; CHECK-NEXT: vins.f16 s27, s1 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmovx.f16 s28, s8 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmovx.f16 s6, s23 -; CHECK-NEXT: vadd.f16 q0, q6, q0 -; CHECK-NEXT: vmovx.f16 s27, s22 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vins.f16 s16, s0 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vins.f16 s27, s6 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: 
vmovx.f16 s5, s18 -; CHECK-NEXT: vmovx.f16 s0, s2 -; CHECK-NEXT: vins.f16 s18, s2 -; CHECK-NEXT: vmovx.f16 s26, s20 -; CHECK-NEXT: vmovx.f16 s2, s21 -; CHECK-NEXT: vins.f16 s28, s6 -; CHECK-NEXT: vmovx.f16 s29, s10 -; CHECK-NEXT: vmovx.f16 s6, s11 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmov.f32 s0, s12 -; CHECK-NEXT: vins.f16 s26, s2 -; CHECK-NEXT: vmov.f32 s2, s14 -; CHECK-NEXT: vins.f16 s29, s6 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s30, s12 -; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vins.f16 s30, s6 -; CHECK-NEXT: vins.f16 s0, s13 -; CHECK-NEXT: vins.f16 s2, s15 -; CHECK-NEXT: vmovx.f16 s6, s15 -; CHECK-NEXT: vmovx.f16 s31, s14 -; CHECK-NEXT: vldrw.u32 q3, [r0] -; CHECK-NEXT: vins.f16 s31, s6 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vmovx.f16 s24, s12 -; CHECK-NEXT: vmovx.f16 s6, s13 -; CHECK-NEXT: vmovx.f16 s25, s14 -; CHECK-NEXT: vins.f16 s24, s6 -; CHECK-NEXT: vmovx.f16 s6, s15 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vins.f16 s22, s23 -; CHECK-NEXT: vins.f16 s20, s21 -; CHECK-NEXT: vins.f16 s12, s13 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vins.f16 s25, s6 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vmovx.f16 s7, s19 -; CHECK-NEXT: vmov.f32 s10, s0 -; CHECK-NEXT: vins.f16 s19, s3 -; CHECK-NEXT: vmov.f32 s11, s2 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vadd.f16 q2, q6, q2 -; CHECK-NEXT: vmov.f32 s15, s22 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vsub.f16 q5, q3, q7 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s13, s22 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s13, s0 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vmovx.f16 s26, s21 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vins.f16 s22, s10 -; CHECK-NEXT: vmovx.f16 s15, s23 -; CHECK-NEXT: vins.f16 s23, s11 -; CHECK-NEXT: vins.f16 s26, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s20, s8 -; CHECK-NEXT: vins.f16 s21, s9 -; CHECK-NEXT: vins.f16 s17, s1 -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vins.f16 s15, s0 -; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vmov.f32 s12, s22 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmov.f32 s10, s21 -; CHECK-NEXT: vmov.f32 s14, s23 -; CHECK-NEXT: vmov.f32 s3, s24 -; CHECK-NEXT: vmov.f32 s11, s26 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vcadd.f16 q2, q4, q2, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vcadd.f16 q3, q4, q3, #90 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-mul.ll --- a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-mul.ll +++ b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f16-mul.ll @@ -73,44 +73,10 @@ define arm_aapcs_vfpcc <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmovx.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s10, s1 -; CHECK-NEXT: vins.f16 s8, s10 -; CHECK-NEXT: vmovx.f16 s9, s2 -; CHECK-NEXT: vmovx.f16 s10, s3 -; CHECK-NEXT: vmovx.f16 s12, s4 -; CHECK-NEXT: vins.f16 
s9, s10 -; CHECK-NEXT: vmovx.f16 s10, s5 -; CHECK-NEXT: vmovx.f16 s13, s6 -; CHECK-NEXT: vins.f16 s12, s10 -; CHECK-NEXT: vmovx.f16 s10, s7 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s13, s10 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vmul.f16 q4, q3, q0 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vfma.f16 q4, q1, q2 -; CHECK-NEXT: vmul.f16 q2, q2, q3 -; CHECK-NEXT: vneg.f16 q2, q2 -; CHECK-NEXT: vfma.f16 q2, q1, q0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmovx.f16 s4, s8 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s17 -; CHECK-NEXT: vins.f16 s8, s16 -; CHECK-NEXT: vins.f16 s9, s17 -; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #90 ; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> @@ -130,75 +96,17 @@ define arm_aapcs_vfpcc <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vmovx.f16 s19, s6 -; CHECK-NEXT: vmovx.f16 s24, s7 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vmovx.f16 s18, s1 -; CHECK-NEXT: vins.f16 s19, s24 -; CHECK-NEXT: vmovx.f16 s24, s8 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmovx.f16 s25, s10 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s17, s2 -; CHECK-NEXT: vmovx.f16 s18, s3 -; CHECK-NEXT: vins.f16 s25, s8 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmovx.f16 s26, s12 -; CHECK-NEXT: vins.f16 s17, s18 -; CHECK-NEXT: vmovx.f16 s18, s4 -; CHECK-NEXT: vmovx.f16 s22, s5 -; CHECK-NEXT: vins.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s1, s9 -; CHECK-NEXT: vins.f16 s26, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmovx.f16 s27, s14 -; CHECK-NEXT: vins.f16 s18, s22 -; CHECK-NEXT: vins.f16 s2, s3 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vins.f16 s24, s1 -; CHECK-NEXT: vins.f16 s4, s5 -; CHECK-NEXT: vins.f16 s27, s8 -; CHECK-NEXT: vmov.f32 s21, s10 -; CHECK-NEXT: vmov.f32 s22, s12 -; CHECK-NEXT: vins.f16 s20, s9 -; CHECK-NEXT: vmov.f32 s23, s14 -; CHECK-NEXT: vins.f16 s21, s11 -; CHECK-NEXT: vmov.f32 s1, s2 -; CHECK-NEXT: vins.f16 s22, s13 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vins.f16 s23, s15 -; CHECK-NEXT: vmov.f32 s3, s6 -; CHECK-NEXT: vmul.f16 q1, q4, q6 -; CHECK-NEXT: vmul.f16 q2, q6, q0 -; CHECK-NEXT: vneg.f16 q3, q1 -; CHECK-NEXT: vfma.f16 q3, q5, q0 -; CHECK-NEXT: vfma.f16 q2, q5, q4 -; CHECK-NEXT: vmovx.f16 s4, s12 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s5, s14 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s12, s8 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s8, s13 -; CHECK-NEXT: vmovx.f16 s0, s9 -; CHECK-NEXT: vmovx.f16 s7, s15 -; CHECK-NEXT: vins.f16 s15, s11 -; CHECK-NEXT: vins.f16 s8, s0 -; CHECK-NEXT: vmovx.f16 s0, s11 -; CHECK-NEXT: vins.f16 s14, s10 -; CHECK-NEXT: vins.f16 s13, s9 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmov q0, q3 -; CHECK-NEXT: vmov.f32 s6, s15 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s4, s14 -; CHECK-NEXT: vmov.f32 s2, s13 -; CHECK-NEXT: vmov.f32 s3, 
s8 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vcmla.f16 q4, q1, q3, #0 +; CHECK-NEXT: vcmla.f16 q5, q0, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q1, q3, #90 +; CHECK-NEXT: vcmla.f16 q5, q0, q2, #90 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> @@ -218,157 +126,33 @@ define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vmov.i32 q4, #0x0 ; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: add r0, sp, #112 -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmovx.f16 s16, s24 -; CHECK-NEXT: vmovx.f16 s18, s25 -; CHECK-NEXT: vins.f16 s16, s18 -; CHECK-NEXT: vmovx.f16 s17, s26 -; CHECK-NEXT: vmovx.f16 s18, s27 -; CHECK-NEXT: vmovx.f16 s19, s29 -; CHECK-NEXT: vins.f16 s17, s18 -; CHECK-NEXT: vmovx.f16 s18, s28 -; CHECK-NEXT: vins.f16 s18, s19 -; CHECK-NEXT: vmovx.f16 s19, s30 -; CHECK-NEXT: vmovx.f16 s8, s31 -; CHECK-NEXT: vmov.f32 s20, s0 -; CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vmov.f32 s21, s2 -; CHECK-NEXT: vmov.f32 s22, s4 -; CHECK-NEXT: vins.f16 s20, s1 -; CHECK-NEXT: vmov.f32 s23, s6 -; CHECK-NEXT: vins.f16 s0, s8 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vmovx.f16 s8, s3 -; CHECK-NEXT: vins.f16 s21, s3 -; CHECK-NEXT: vins.f16 s1, s8 -; CHECK-NEXT: vmovx.f16 s2, s4 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s22, s5 -; CHECK-NEXT: vins.f16 s23, s7 -; CHECK-NEXT: vmovx.f16 s4, s7 -; CHECK-NEXT: vmovx.f16 s3, s6 -; CHECK-NEXT: vins.f16 s26, s27 -; CHECK-NEXT: vins.f16 s30, s31 -; CHECK-NEXT: vins.f16 s28, s29 -; CHECK-NEXT: vins.f16 s2, s8 -; CHECK-NEXT: vins.f16 s24, s25 -; CHECK-NEXT: vmov.f32 s25, s26 -; CHECK-NEXT: vins.f16 s3, s4 -; CHECK-NEXT: vmul.f16 q2, q4, q5 -; CHECK-NEXT: vmov.f32 s26, s28 -; CHECK-NEXT: add r0, sp, #128 -; CHECK-NEXT: vmov.f32 s27, s30 -; CHECK-NEXT: vfma.f16 q2, q6, q0 -; CHECK-NEXT: vmul.f16 q0, q0, q4 -; CHECK-NEXT: vneg.f16 q4, q0 -; CHECK-NEXT: vmovx.f16 s0, s8 -; CHECK-NEXT: vfma.f16 q4, q6, q5 -; CHECK-NEXT: vmovx.f16 s20, s12 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vmovx.f16 s5, s18 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmovx.f16 s0, s10 -; CHECK-NEXT: vins.f16 s5, s0 -; CHECK-NEXT: vmovx.f16 s0, s13 -; CHECK-NEXT: vins.f16 s16, s8 -; CHECK-NEXT: vins.f16 s18, s10 -; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s20, s0 -; CHECK-NEXT: vmovx.f16 s21, s14 -; CHECK-NEXT: vmovx.f16 s0, s15 -; CHECK-NEXT: vmovx.f16 s22, s8 -; CHECK-NEXT: vins.f16 s21, s0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: add r0, sp, #144 -; CHECK-NEXT: vins.f16 s22, s6 -; CHECK-NEXT: vmovx.f16 s23, s10 -; CHECK-NEXT: vmovx.f16 s6, s11 -; CHECK-NEXT: vmov.f32 
s24, s0 -; CHECK-NEXT: vldrw.u32 q7, [r0] -; CHECK-NEXT: vins.f16 s23, s6 -; CHECK-NEXT: vmovx.f16 s6, s1 -; CHECK-NEXT: vmovx.f16 s0, s0 -; CHECK-NEXT: vins.f16 s24, s1 -; CHECK-NEXT: vins.f16 s0, s6 -; CHECK-NEXT: vmovx.f16 s6, s3 -; CHECK-NEXT: vmovx.f16 s1, s2 -; CHECK-NEXT: vmov.f32 s25, s2 -; CHECK-NEXT: vins.f16 s1, s6 -; CHECK-NEXT: vmovx.f16 s6, s29 -; CHECK-NEXT: vmovx.f16 s2, s28 -; CHECK-NEXT: vins.f16 s25, s3 -; CHECK-NEXT: vins.f16 s2, s6 -; CHECK-NEXT: vins.f16 s8, s9 -; CHECK-NEXT: vins.f16 s10, s11 -; CHECK-NEXT: vins.f16 s14, s15 -; CHECK-NEXT: vmovx.f16 s6, s31 -; CHECK-NEXT: vmovx.f16 s3, s30 -; CHECK-NEXT: vmov.f32 s26, s28 -; CHECK-NEXT: vmov.f32 s27, s30 -; CHECK-NEXT: vins.f16 s12, s13 -; CHECK-NEXT: vmov.f32 s13, s14 -; CHECK-NEXT: vins.f16 s3, s6 -; CHECK-NEXT: vmov.f32 s14, s8 -; CHECK-NEXT: vins.f16 s26, s29 -; CHECK-NEXT: vmov.f32 s15, s10 -; CHECK-NEXT: vins.f16 s27, s31 -; CHECK-NEXT: vmul.f16 q7, q0, q3 -; CHECK-NEXT: vmul.f16 q0, q5, q0 -; CHECK-NEXT: vfma.f16 q7, q6, q5 -; CHECK-NEXT: vneg.f16 q5, q0 -; CHECK-NEXT: vfma.f16 q5, q6, q3 -; CHECK-NEXT: vmovx.f16 s0, s28 -; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s13, s22 -; CHECK-NEXT: vins.f16 s6, s0 -; CHECK-NEXT: vmovx.f16 s0, s30 -; CHECK-NEXT: vins.f16 s13, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vmovx.f16 s7, s19 -; CHECK-NEXT: vmovx.f16 s0, s1 -; CHECK-NEXT: vmovx.f16 s26, s21 -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vins.f16 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s29 -; CHECK-NEXT: vins.f16 s22, s30 -; CHECK-NEXT: vmovx.f16 s15, s23 -; CHECK-NEXT: vins.f16 s23, s31 -; CHECK-NEXT: vins.f16 s26, s0 -; CHECK-NEXT: vmovx.f16 s0, s31 -; CHECK-NEXT: vins.f16 s20, s28 -; CHECK-NEXT: vins.f16 s21, s29 -; CHECK-NEXT: vins.f16 s17, s1 -; CHECK-NEXT: vins.f16 s19, s3 -; CHECK-NEXT: vmov q2, q5 -; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmov q5, q1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmov q0, q4 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s9, s6 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vmov.f32 s12, s22 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmov.f32 s10, s21 -; CHECK-NEXT: vmov.f32 s14, s23 -; CHECK-NEXT: vmov.f32 s3, s24 -; CHECK-NEXT: vmov.f32 s11, s26 -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vcmla.f16 q0, q1, q6, #0 +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vcmla.f16 q0, q1, q6, #90 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vcmla.f16 q1, q5, q6, #0 +; CHECK-NEXT: vcmla.f16 q1, q5, q6, #90 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vcmla.f16 q5, q2, q6, #0 +; CHECK-NEXT: vcmla.f16 q5, q2, q6, #90 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vcmla.f16 q4, q3, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q3, q2, #90 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> diff --git a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-add.ll --- a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-add.ll +++ b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-add.ll @@ -25,19 +25,8 @@ define arm_aapcs_vfpcc <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x 
float> %b) { ; CHECK-LABEL: complex_add_v4f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s8, s0 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s9, s2 -; CHECK-NEXT: vmov.f32 s13, s7 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vadd.f32 q2, q3, q2 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vsub.f32 q1, q1, q0 -; CHECK-NEXT: vmov.f32 s1, s8 -; CHECK-NEXT: vmov.f32 s0, s4 -; CHECK-NEXT: vmov.f32 s2, s5 -; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> @@ -52,34 +41,13 @@ define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: complex_add_v8f32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov.f32 s4, s0 -; CHECK-NEXT: vmov.f32 s5, s2 -; CHECK-NEXT: vmov.f32 s20, s9 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s9, s10 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vmov.f32 s22, s13 -; CHECK-NEXT: vmov.f32 s23, s15 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vadd.f32 q1, q5, q1 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f32 s10, s12 -; CHECK-NEXT: vmov.f32 s11, s14 -; CHECK-NEXT: vsub.f32 q2, q2, q0 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s0, s8 -; CHECK-NEXT: vmov.f32 s2, s9 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcadd.f32 q4, q2, q0, #90 +; CHECK-NEXT: vcadd.f32 q2, q3, q1, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> @@ -96,64 +64,22 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: add r2, sp, #64 -; CHECK-NEXT: add r3, sp, #80 +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: add r2, sp, #80 ; CHECK-NEXT: vldrw.u32 q5, [r3] -; CHECK-NEXT: vldrw.u32 q6, [r2] -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov.f32 s4, s0 -; CHECK-NEXT: vmov.f32 s5, s2 -; CHECK-NEXT: add r0, sp, #96 -; CHECK-NEXT: vmov.f32 s28, s25 -; CHECK-NEXT: add r1, sp, #112 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s0, s1 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vmov.f32 s25, s26 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vmov.f32 s26, s20 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vmov.f32 s6, s16 -; CHECK-NEXT: vmov.f32 s7, s18 -; CHECK-NEXT: vsub.f32 q4, q6, q0 -; CHECK-NEXT: vmov.f32 s30, s21 -; CHECK-NEXT: vldrw.u32 q6, [r0] -; CHECK-NEXT: vmov.f32 s31, s23 -; CHECK-NEXT: vldrw.u32 q5, [r1] -; CHECK-NEXT: vadd.f32 q1, q7, q1 -; CHECK-NEXT: vmov.f32 s0, s16 -; CHECK-NEXT: vmov.f32 s1, s4 -; CHECK-NEXT: vmov.f32 s2, s17 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov.f32 s4, s18 -; CHECK-NEXT: vmov.f32 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vmov.f32 s16, s8 -; CHECK-NEXT: vmov.f32 s17, s10 -; CHECK-NEXT: vmov.f32 s28, s25 -; CHECK-NEXT: vmov.f32 s29, s27 -; CHECK-NEXT: vmov.f32 s8, s9 -; CHECK-NEXT: vmov.f32 
 define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: complex_add_v8f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov.f32 s4, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
-; CHECK-NEXT:    vmov.f32 s20, s9
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.f32 s0, s1
-; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vmov.f32 s9, s10
-; CHECK-NEXT:    vmov.f32 s6, s16
-; CHECK-NEXT:    vmov.f32 s7, s18
-; CHECK-NEXT:    vmov.f32 s22, s13
-; CHECK-NEXT:    vmov.f32 s23, s15
-; CHECK-NEXT:    vmov.f32 s2, s17
-; CHECK-NEXT:    vadd.f32 q1, q5, q1
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov.f32 s10, s12
-; CHECK-NEXT:    vmov.f32 s11, s14
-; CHECK-NEXT:    vsub.f32 q2, q2, q0
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vmov.f32 s0, s8
-; CHECK-NEXT:    vmov.f32 s2, s9
-; CHECK-NEXT:    vmov.f32 s4, s10
-; CHECK-NEXT:    vmov.f32 s6, s11
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vcadd.f32 q4, q2, q0, #90
+; CHECK-NEXT:    vcadd.f32 q2, q3, q1, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -96,64 +64,22 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    add r3, sp, #80
+; CHECK-NEXT:    add r3, sp, #64
+; CHECK-NEXT:    add r2, sp, #80
 ; CHECK-NEXT:    vldrw.u32 q5, [r3]
-; CHECK-NEXT:    vldrw.u32 q6, [r2]
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov.f32 s4, s0
-; CHECK-NEXT:    vmov.f32 s5, s2
-; CHECK-NEXT:    add r0, sp, #96
-; CHECK-NEXT:    vmov.f32 s28, s25
-; CHECK-NEXT:    add r1, sp, #112
-; CHECK-NEXT:    vmov.f32 s29, s27
-; CHECK-NEXT:    vmov.f32 s0, s1
-; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vmov.f32 s25, s26
-; CHECK-NEXT:    vmov.f32 s2, s17
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov.f32 s26, s20
-; CHECK-NEXT:    vmov.f32 s27, s22
-; CHECK-NEXT:    vmov.f32 s6, s16
-; CHECK-NEXT:    vmov.f32 s7, s18
-; CHECK-NEXT:    vsub.f32 q4, q6, q0
-; CHECK-NEXT:    vmov.f32 s30, s21
-; CHECK-NEXT:    vldrw.u32 q6, [r0]
-; CHECK-NEXT:    vmov.f32 s31, s23
-; CHECK-NEXT:    vldrw.u32 q5, [r1]
-; CHECK-NEXT:    vadd.f32 q1, q7, q1
-; CHECK-NEXT:    vmov.f32 s0, s16
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov.f32 s2, s17
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov.f32 s4, s18
-; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vmov.f32 s6, s19
-; CHECK-NEXT:    vmov.f32 s16, s8
-; CHECK-NEXT:    vmov.f32 s17, s10
-; CHECK-NEXT:    vmov.f32 s28, s25
-; CHECK-NEXT:    vmov.f32 s29, s27
-; CHECK-NEXT:    vmov.f32 s8, s9
-; CHECK-NEXT:    vmov.f32 s9, s11
-; CHECK-NEXT:    vmov.f32 s25, s26
-; CHECK-NEXT:    vmov.f32 s18, s12
-; CHECK-NEXT:    vmov.f32 s19, s14
-; CHECK-NEXT:    vmov.f32 s30, s21
-; CHECK-NEXT:    vmov.f32 s31, s23
-; CHECK-NEXT:    vmov.f32 s10, s13
-; CHECK-NEXT:    vadd.f32 q4, q7, q4
-; CHECK-NEXT:    vmov.f32 s11, s15
-; CHECK-NEXT:    vmov.f32 s26, s20
-; CHECK-NEXT:    vmov.f32 s27, s22
-; CHECK-NEXT:    vsub.f32 q3, q6, q2
-; CHECK-NEXT:    vmov.f32 s9, s16
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vmov.f32 s17, s18
-; CHECK-NEXT:    vmov.f32 s16, s14
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vmov.f32 s8, s12
-; CHECK-NEXT:    vmov.f32 s10, s13
-; CHECK-NEXT:    vmov q3, q4
+; CHECK-NEXT:    add r1, sp, #96
+; CHECK-NEXT:    add r0, sp, #112
+; CHECK-NEXT:    vcadd.f32 q4, q5, q0, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vcadd.f32 q5, q0, q1, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vcadd.f32 q6, q0, q2, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q2, q6
+; CHECK-NEXT:    vcadd.f32 q7, q0, q3, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q3, q7
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-mul.ll
--- a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-mul.ll
+++ b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f32-mul.ll
@@ -30,25 +30,10 @@
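+; A complex multiply should lower to a pair of VCMLAs on a zeroed
+; accumulator: the #0 step accumulates a.re*b.re into re and a.re*b.im
+; into im, and the #90 step accumulates -a.im*b.im and a.im*b.re,
+; together forming the full complex product.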
 define arm_aapcs_vfpcc <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: complex_mul_v4f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vmov.f32 s12, s0
-; CHECK-NEXT:    vmov.f32 s16, s5
-; CHECK-NEXT:    vmov.f32 s13, s2
-; CHECK-NEXT:    vmov.f32 s17, s7
-; CHECK-NEXT:    vmov.f32 s0, s1
-; CHECK-NEXT:    vmul.f32 q2, q4, q3
-; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vfma.f32 q2, q1, q0
-; CHECK-NEXT:    vmul.f32 q0, q0, q4
-; CHECK-NEXT:    vneg.f32 q4, q0
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vfma.f32 q4, q1, q3
-; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov.f32 s0, s16
-; CHECK-NEXT:    vmov.f32 s2, s17
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #0
+; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
@@ -68,37 +53,17 @@
 define arm_aapcs_vfpcc <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: complex_mul_v8f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov.f32 s20, s0
-; CHECK-NEXT:    vmov.f32 s21, s2
-; CHECK-NEXT:    vmov.f32 s24, s9
-; CHECK-NEXT:    vmov.f32 s25, s11
-; CHECK-NEXT:    vmov.f32 s22, s16
-; CHECK-NEXT:    vmov.f32 s23, s18
-; CHECK-NEXT:    vmov.f32 s26, s13
-; CHECK-NEXT:    vmov.f32 s27, s15
-; CHECK-NEXT:    vmov.f32 s0, s1
-; CHECK-NEXT:    vmul.f32 q1, q6, q5
-; CHECK-NEXT:    vmov.f32 s1, s3
-; CHECK-NEXT:    vmov.f32 s9, s10
-; CHECK-NEXT:    vmov.f32 s2, s17
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov.f32 s10, s12
-; CHECK-NEXT:    vmov.f32 s11, s14
-; CHECK-NEXT:    vfma.f32 q1, q2, q0
-; CHECK-NEXT:    vmul.f32 q0, q0, q6
-; CHECK-NEXT:    vneg.f32 q3, q0
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vfma.f32 q3, q2, q5
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vmov.f32 s0, s12
-; CHECK-NEXT:    vmov.f32 s2, s13
-; CHECK-NEXT:    vmov.f32 s4, s14
-; CHECK-NEXT:    vmov.f32 s6, s15
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vcmla.f32 q4, q1, q3, #0
+; CHECK-NEXT:    vcmla.f32 q5, q0, q2, #0
+; CHECK-NEXT:    vcmla.f32 q4, q1, q3, #90
+; CHECK-NEXT:    vcmla.f32 q5, q0, q2, #90
+; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -118,78 +83,33 @@
 define arm_aapcs_vfpcc <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: complex_mul_v16f32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    add r2, sp, #96
-; CHECK-NEXT:    add r3, sp, #112
-; CHECK-NEXT:    vldrw.u32 q5, [r3]
-; CHECK-NEXT:    vldrw.u32 q4, [r2]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s24, s1
-; CHECK-NEXT:    vmov.f32 s12, s17
-; CHECK-NEXT:    add r0, sp, #144
-; CHECK-NEXT:    vmov.f32 s13, s19
-; CHECK-NEXT:    add r1, sp, #128
-; CHECK-NEXT:    vmov.f32 s25, s3
-; CHECK-NEXT:    vmov.f32 s14, s21
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vmov.f32 s26, s5
-; CHECK-NEXT:    vmov.f32 s27, s7
-; CHECK-NEXT:    vmov.f32 s1, s2
-; CHECK-NEXT:    vmul.f32 q7, q6, q3
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmov.f32 s3, s6
-; CHECK-NEXT:    vmul.f32 q1, q3, q0
-; CHECK-NEXT:    vmov.f32 s17, s18
-; CHECK-NEXT:    vmov.f32 s18, s20
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vmov.f32 s19, s22
-; CHECK-NEXT:    vneg.f32 q5, q7
-; CHECK-NEXT:    vfma.f32 q5, q4, q0
-; CHECK-NEXT:    vfma.f32 q1, q4, q6
-; CHECK-NEXT:    vmov.f32 s0, s20
-; CHECK-NEXT:    vldrw.u32 q6, [r1]
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmov.f32 s2, s21
-; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s20, s8
-; CHECK-NEXT:    vmov.f32 s21, s10
-; CHECK-NEXT:    vmov.f32 s28, s25
-; CHECK-NEXT:    vmov.f32 s29, s27
-; CHECK-NEXT:    vmov.f32 s8, s9
-; CHECK-NEXT:    vmov.f32 s9, s11
-; CHECK-NEXT:    vmov.f32 s30, s13
-; CHECK-NEXT:    vmov.f32 s31, s15
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vmov.f32 s11, s3
-; CHECK-NEXT:    vmov.f32 s4, s22
-; CHECK-NEXT:    vmov.f32 s5, s6
-; CHECK-NEXT:    vmov.f32 s6, s23
-; CHECK-NEXT:    vmov.f32 s22, s0
-; CHECK-NEXT:    vmov.f32 s23, s2
-; CHECK-NEXT:    vmul.f32 q0, q2, q7
-; CHECK-NEXT:    vmov.f32 s25, s26
-; CHECK-NEXT:    vmul.f32 q4, q7, q5
-; CHECK-NEXT:    vmov.f32 s26, s12
-; CHECK-NEXT:    vneg.f32 q0, q0
-; CHECK-NEXT:    vmov.f32 s27, s14
-; CHECK-NEXT:    vfma.f32 q4, q6, q2
-; CHECK-NEXT:    vfma.f32 q0, q6, q5
-; CHECK-NEXT:    vmov.f32 s9, s16
-; CHECK-NEXT:    vmov.f32 s11, s17
-; CHECK-NEXT:    vmov.f32 s17, s18
-; CHECK-NEXT:    vmov.f32 s16, s2
-; CHECK-NEXT:    vmov.f32 s18, s3
-; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    add r3, sp, #48
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vldrw.u32 q6, [r3]
+; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    add r2, sp, #64
+; CHECK-NEXT:    vcmla.f32 q0, q5, q6, #0
+; CHECK-NEXT:    add r1, sp, #80
+; CHECK-NEXT:    vcmla.f32 q0, q5, q6, #90
+; CHECK-NEXT:    vldrw.u32 q6, [r2]
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    add r0, sp, #96
+; CHECK-NEXT:    vcmla.f32 q5, q1, q6, #0
+; CHECK-NEXT:    vcmla.f32 q5, q1, q6, #90
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vcmla.f32 q6, q2, q1, #0
+; CHECK-NEXT:    vcmla.f32 q6, q2, q1, #90
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov q2, q6
+; CHECK-NEXT:    vcmla.f32 q4, q3, q1, #0
+; CHECK-NEXT:    vcmla.f32 q4, q3, q1, #90
+; CHECK-NEXT:    vmov q1, q5
 ; CHECK-NEXT:    vmov q3, q4
-; CHECK-NEXT:    vmov.f32 s10, s1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    add sp, #32
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
diff --git a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f64-add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f64-add.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+
+
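+; MVE's VCADD only has f16 and f32 forms, so no complex instruction is
+; expected for <N x double>; these checks pin down the plain
+; vadd.f64/vsub.f64 lowering that should be left in place.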
+define arm_aapcs_vfpcc <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: complex_add_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f64 d3, d3, d0
+; CHECK-NEXT:    vsub.f64 d2, d2, d1
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x double> %b.real, %a.imag
+  %1 = fadd fast <1 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %interleaved.vec
+}
+define arm_aapcs_vfpcc <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_add_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f64 d5, d5, d0
+; CHECK-NEXT:    vsub.f64 d4, d4, d1
+; CHECK-NEXT:    vadd.f64 d7, d7, d2
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vsub.f64 d6, d6, d3
+; CHECK-NEXT:    vmov q1, q3
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x double> %b.real, %a.imag
+  %1 = fadd fast <2 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: complex_add_v8f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    add r0, sp, #32
+; CHECK-NEXT:    vmov q4, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    add r0, sp, #48
+; CHECK-NEXT:    vadd.f64 d1, d1, d2
+; CHECK-NEXT:    vsub.f64 d0, d0, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    add r0, sp, #64
+; CHECK-NEXT:    vadd.f64 d3, d3, d8
+; CHECK-NEXT:    vsub.f64 d2, d2, d9
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #80
+; CHECK-NEXT:    vadd.f64 d9, d9, d4
+; CHECK-NEXT:    vsub.f64 d8, d8, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f64 d11, d5, d6
+; CHECK-NEXT:    vsub.f64 d10, d4, d7
+; CHECK-NEXT:    vmov q2, q4
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x double> %b.real, %a.imag
+  %1 = fadd fast <4 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %interleaved.vec
+}
diff --git a/llvm/test/CodeGen/Thumb2/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f64-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/complex-arithmetic-f64-mul.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
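+; As with the additions, MVE has no f64 VCMLA, so these multiplies are
+; expected to remain scalar vmul/vfma/vfnms.f64 sequences rather than
+; being picked up by the deinterleaving pass.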
+define arm_aapcs_vfpcc <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: complex_mul_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.f64 d5, d3, d0
+; CHECK-NEXT:    vmul.f64 d4, d1, d3
+; CHECK-NEXT:    vfma.f64 d5, d2, d1
+; CHECK-NEXT:    vfnms.f64 d4, d2, d0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x double> %b.imag, %a.real
+  %1 = fmul fast <1 x double> %b.real, %a.imag
+  %2 = fadd fast <1 x double> %1, %0
+  %3 = fmul fast <1 x double> %b.real, %a.real
+  %4 = fmul fast <1 x double> %a.imag, %b.imag
+  %5 = fsub fast <1 x double> %3, %4
+  %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %interleaved.vec
+}
+
+define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_mul_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmul.f64 d9, d7, d2
+; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmul.f64 d8, d3, d7
+; CHECK-NEXT:    vfma.f64 d9, d6, d3
+; CHECK-NEXT:    vfnms.f64 d8, d6, d2
+; CHECK-NEXT:    vmul.f64 d1, d5, d10
+; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmul.f64 d0, d11, d5
+; CHECK-NEXT:    vfma.f64 d1, d4, d11
+; CHECK-NEXT:    vfnms.f64 d0, d4, d10
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %b.imag, %a.real
+  %1 = fmul fast <2 x double> %b.real, %a.imag
+  %2 = fadd fast <2 x double> %1, %0
+  %3 = fmul fast <2 x double> %b.real, %a.real
+  %4 = fmul fast <2 x double> %a.imag, %b.imag
+  %5 = fsub fast <2 x double> %3, %4
+  %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
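+; The widest case additionally exercises operands passed on the stack;
+; the checks below cover the resulting vldrw loads and the 16-byte
+; spills and reloads around the multiplies.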
+define arm_aapcs_vfpcc <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: complex_mul_v8f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    add r0, sp, #128
+; CHECK-NEXT:    vmov q7, q1
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #160
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov q6, q0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    add r0, sp, #176
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f64 d5, d3, d0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f64 d4, d1, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    add r0, sp, #144
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f64 d11, d3, d0
+; CHECK-NEXT:    vmul.f64 d10, d1, d3
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmul.f64 d7, d9, d12
+; CHECK-NEXT:    vmul.f64 d2, d15, d1
+; CHECK-NEXT:    vmul.f64 d3, d1, d14
+; CHECK-NEXT:    vmul.f64 d6, d13, d9
+; CHECK-NEXT:    vfma.f64 d7, d8, d13
+; CHECK-NEXT:    vfnms.f64 d6, d8, d12
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f64 d3, d0, d15
+; CHECK-NEXT:    vfnms.f64 d2, d0, d14
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f64 d5, d0, d9
+; CHECK-NEXT:    vfnms.f64 d4, d0, d8
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f64 d11, d0, d9
+; CHECK-NEXT:    vfnms.f64 d10, d0, d8
+; CHECK-NEXT:    vmov q0, q3
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x double> %b.imag, %a.real
+  %1 = fmul fast <4 x double> %b.real, %a.imag
+  %2 = fadd fast <4 x double> %1, %0
+  %3 = fmul fast <4 x double> %b.real, %a.real
+  %4 = fmul fast <4 x double> %a.imag, %b.imag
+  %5 = fsub fast <4 x double> %3, %4
+  %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %interleaved.vec
+}