diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -0,0 +1,40 @@
+//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic and deinterleaving.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct ComplexDeinterleavingPass
+    : public PassInfoMixin<ComplexDeinterleavingPass> {
+private:
+  TargetMachine *TM;
+
+public:
+  ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+enum class ComplexDeinterleavingOperation { None, CAdd, CMulPartial };
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -80,6 +80,10 @@
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass();
 
+  /// This pass implements generation of target-specific intrinsics to support
+  /// handling of complex number arithmetic.
+  FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM);
+
   /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
   /// load-linked/store-conditional loops.
   extern char &AtomicExpandID;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -22,6 +22,7 @@
 #ifndef LLVM_CODEGEN_TARGETLOWERING_H
 #define LLVM_CODEGEN_TARGETLOWERING_H
 
+#include "ComplexDeinterleavingPass.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -3051,6 +3052,27 @@
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Does this target support complex deinterleaving?
+  virtual bool isComplexDeinterleavingSupported() const { return false; }
+
+  /// Does this target support complex deinterleaving with the given operation
+  /// and type?
+  virtual bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const {
+    return false;
+  }
+
+  /// Create the IR node for the given complex deinterleaving operation.
+  /// If one cannot be created using all the given inputs, nullptr should be
+  /// returned.
+ virtual Value * + createComplexDeinterleavingIR(Instruction *I, + ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const { + return nullptr; + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -101,26 +101,27 @@ void initializeCFIFixupPass(PassRegistry&); void initializeCFIInstrInserterPass(PassRegistry&); void initializeCFLAndersAAWrapperPassPass(PassRegistry&); -void initializeCFLSteensAAWrapperPassPass(PassRegistry&); +void initializeCFLSteensAAWrapperPassPass(PassRegistry &); void initializeCGProfileLegacyPassPass(PassRegistry &); -void initializeCallGraphDOTPrinterPass(PassRegistry&); -void initializeCallGraphPrinterLegacyPassPass(PassRegistry&); -void initializeCallGraphViewerPass(PassRegistry&); -void initializeCallGraphWrapperPassPass(PassRegistry&); -void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); +void initializeCallGraphDOTPrinterPass(PassRegistry &); +void initializeCallGraphPrinterLegacyPassPass(PassRegistry &); +void initializeCallGraphViewerPass(PassRegistry &); +void initializeCallGraphWrapperPassPass(PassRegistry &); +void initializeCallSiteSplittingLegacyPassPass(PassRegistry &); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); -void initializeConstantHoistingLegacyPassPass(PassRegistry&); -void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeCodeGenPreparePass(PassRegistry &); +void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &); +void initializeConstantHoistingLegacyPassPass(PassRegistry &); +void initializeConstantMergeLegacyPassPass(PassRegistry &); void initializeConstraintEliminationPass(PassRegistry &); -void initializeControlHeightReductionLegacyPassPass(PassRegistry&); -void initializeCorrelatedValuePropagationPass(PassRegistry&); -void initializeCostModelAnalysisPass(PassRegistry&); -void initializeCrossDSOCFIPass(PassRegistry&); +void initializeControlHeightReductionLegacyPassPass(PassRegistry &); +void initializeCorrelatedValuePropagationPass(PassRegistry &); +void initializeCostModelAnalysisPass(PassRegistry &); +void initializeCrossDSOCFIPass(PassRegistry &); void initializeCycleInfoWrapperPassPass(PassRegistry &); -void initializeDAEPass(PassRegistry&); -void initializeDAHPass(PassRegistry&); +void initializeDAEPass(PassRegistry &); +void initializeDAHPass(PassRegistry &); void initializeDCELegacyPassPass(PassRegistry&); void initializeDFAJumpThreadingLegacyPassPass(PassRegistry &); void initializeDSELegacyPassPass(PassRegistry&); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -46,6 +46,7 @@ CodeGenPassBuilder.cpp CodeGenPrepare.cpp CommandFlags.cpp + ComplexDeinterleavingPass.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DetectDeadLanes.cpp diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -0,0 +1,1092 @@ +//===- ComplexDeinterleavingPass.cpp 
+//------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass is broken down into 3 steps; Discovery, Identification, and +// Replacement. +// +// Discovery: +// The discovery step is responsible for finding the search space for the +// complex graph. The first Instruction pointer it takes is assumed to be the +// converging shuffle of the complex graph (identified by the mask representing +// an interleaving pattern. e.g. `<0, 2, 1, 3>`), and ascends through the +// operands depth-first to find the respective deinterleaving shuffles +// (identified by the mask being `<0, 2, 4, 6>` or `<1, 3, 5, 7>`). +// Beyond `Instructions[0]` being the converging shuffle, this step makes no +// guarantees as to the order of `Instructions`. +// +// Identification: +// This step is responsible for finding the patterns that can be lowered to +// complex instructions. Iterating over `Instructions`, it first performs some +// pattern matching to find a predictable partial multiply case, performing some +// analysis on the order and operating component of the operands to identify +// which rotation around the argand plane is represented by the pattern. The +// step then attempts to pair up "Orphaned" instructions (instructions that have +// no shared parent that would be part of the same node, e.g. an add and a sub +// that represent a complex add). After attempting to pair orphaned +// instructions, the presence of any instructions outside of composite nodes +// means that the graph cannot be lowered confidently, causing the pass to stop +// analysing the current graph. If it continues, the composite nodes are sorted +// to reflect the underlying instruction order, and the uses are checked to find +// any accumulator cases. +// +// Replacement: +// This step performs the necessary input wrangling (chasing values through +// accumulators, shuffles, and other composite nodes) in order for the target to +// know what to generate. While some additional checks are performed at this +// step, it is expected to finish successfully, while any errors should be +// caught via asserts. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "complex-deinterleaving" + +STATISTIC(NumComplexIntrinsics, "Number of complex intrinsics generated"); + +static cl::opt ComplexArithmeticEnabled( + "enable-complex-arithmetic", + cl::desc("Enable generation of complex arithmetic instructions"), + cl::init(true), cl::Hidden); + +/// Checks the given mask, and determines whether said mask is interleaving. 
+/// +/// To be interleaving, a mask must alternate between `i` and `i + (Length / +/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a +/// 4x vector interleaving mask would be <0, 2, 1, 3>). +static bool isInterleavingMask(ArrayRef Mask); +/// Checks the given mask, and determines whether said mask is deinterleaving. +/// +/// To be interleaving, a mask must increment in steps of 2, and either start +/// with 0 or 1. +/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or +/// <1, 3, 5, 7>). +static bool isDeinterleavingMask(ArrayRef Mask); + +namespace { + +/// Creates an integer array of length \p len, where each item is \p step more +/// than the previous. An offset can be provided to specify the first element. +static SmallVector createArrayWithStep(int len, int step, int offset = 0) { + SmallVector Arr(len); + for (int j = 0; j < len; j++) + Arr[j] = (j * step) + offset; + return Arr; +} + +/// Creates a deinterleaving mask of the given length at the given offset. +/// A deinterleaving mask looks like <0, 2, 4, 6> or <1, 3, 5, 7> +static SmallVector createDeinterleavingMask(int len, int offset = 0) { + return createArrayWithStep(len, 2, offset); +} + +class ComplexDeinterleavingLegacyPass : public FunctionPass { +public: + static char ID; + + ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeComplexDeinterleavingLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Complex Arithmetic Pass"; } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + +private: + const TargetMachine *TM; +}; + +enum OperatingComponent { Real, Imaginary, Unknown }; + +class ComplexDeinterleavingGraph; +struct ComplexDeinterleavingCompositeNode { + + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op) + : Operation(Op) {} + +private: + friend class ComplexDeinterleavingGraph; + +public: + SmallVector getOperands() { + SmallVector Ops; + + for (const auto &Inst : ContainedInstructions) { + for (Value *V : Inst->operands()) { + auto *I = dyn_cast(V); + if (!I || !contains(I)) { + Ops.push_back(V); + continue; + } + } + } + return Ops; + } + + Value *getOperand(unsigned Idx) { return getOperands()[Idx]; } + + unsigned getNumOperands() { return getOperands().size(); } + + SmallVector ContainedInstructions; + Value *OutputNode = nullptr; + Value *OriginalInput0 = nullptr; + Value *OriginalInput1 = nullptr; + Value *ReplacementNode = nullptr; + bool IsTopLevel = false; + ComplexDeinterleavingOperation Operation; + + bool UsesNegation = false; + unsigned Rotation = 0; + Value *Input0 = nullptr; + Value *Input1 = nullptr; + Value *Accumulator = nullptr; + Value *Accumulatee = nullptr; + + void addInstruction(Instruction *I) { ContainedInstructions.push_back(I); } + bool contains(Instruction *I) { + if (I == ReplacementNode) + return true; + + return llvm::find(ContainedInstructions, I) != ContainedInstructions.end(); + } +}; + +class ComplexDeinterleavingGraph { +private: + using NodePtr = std::shared_ptr; + + SmallVector Instructions; + SmallVector CompositeNodes; + + llvm::TargetTransformInfo::TargetCostKind CostKind = + llvm::TargetTransformInfo::TCK_Latency; + + InstructionCost CostOfIntrinsics; + + /// Determines the operating component of the given Value. 
+ /// This is achieved by looking at the operating component of the Value's + /// operands and, based on the instruction, evaluates what the resulting + /// component would be. + OperatingComponent getOperatingComponentOfValue(Value *V) { + Instruction *I = dyn_cast_or_null(V); + if (!I) + return Unknown; + + if (auto *Shuffle = dyn_cast(I)) { + auto ShuffleMask = Shuffle->getShuffleMask(); + if (isDeinterleavingMask(ShuffleMask)) { + if (ShuffleMask[0] == 0) + return Real; + if (ShuffleMask[0] == 1) + return Imaginary; + } + return Unknown; + } + + if (I->getOpcode() == Instruction::FMul) { + auto Op0Component = getOperatingComponentOfValue(I->getOperand(0)); + auto Op1Component = getOperatingComponentOfValue(I->getOperand(1)); + if (Op0Component == Unknown || Op1Component == Unknown) + return Unknown; + if (Op0Component == Op1Component) + return Real; + return Imaginary; + } + + if (I->getOpcode() == Instruction::FNeg) + return getOperatingComponentOfValue(I->getOperand(0)); + + if (I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) { + auto Op0Component = getOperatingComponentOfValue(I->getOperand(0)); + auto Op1Component = getOperatingComponentOfValue(I->getOperand(1)); + if (Op0Component != Op1Component || Op1Component == Unknown) + return Unknown; + return Op0Component; + } + + return Unknown; + } + + void addInstruction(Instruction *I) { Instructions.push_back(I); } + + void sortCompositeNodes(BasicBlock *B) { + SmallVector NewNodeList; + + // Sort the nodelist based on the instruction order + for (auto &I : *B) { + if (auto CN = findNodeFromOutput(&I)) + NewNodeList.push_back(CN); + } + + for (unsigned i = 0; i < NewNodeList.size(); i++) + CompositeNodes[i] = NewNodeList[i]; + } + + NodePtr findNodeFromOutput(Instruction *I) { + for (const auto &Item : CompositeNodes) { + if (Item->OutputNode == I) + return Item; + } + + return nullptr; + } + + SmallVector findUnmatchedInstructions() { + SmallVector Is; + for (auto &I : Instructions) { + if (shouldIgnoreValue(I)) + continue; + if (getContainingComposite(I) == nullptr) + Is.push_back(I); + } + return Is; + } + + Value *getSharedOperand(Instruction *A, Instruction *B, unsigned &Idx) { + if (A->getNumOperands() != B->getNumOperands()) + return nullptr; + + for (unsigned OpIdx = 0; OpIdx < A->getNumOperands(); OpIdx++) { + auto *Op = A->getOperand(OpIdx); + if (Op == B->getOperand(OpIdx)) { + Idx = OpIdx; + return Op; + } + } + return nullptr; + } + + bool haveSharedUses(Value *A, Value *B) { + if (A->hasOneUser() && B->hasOneUser()) { + auto *AUser = *A->user_begin(); + auto *BUser = *B->user_begin(); + + if (AUser && AUser == BUser) + return true; + + auto AUCN = getContainingComposite(dyn_cast(AUser)); + auto BUCN = getContainingComposite(dyn_cast(BUser)); + + if (AUCN && AUCN == BUCN) + return true; + } + + if (A->getNumUses() != B->getNumUses()) + return false; + + for (const auto &AUser : A->users()) { + bool Found = false; + auto AUCN = getContainingComposite(dyn_cast(AUser)); + for (const auto &BUser : B->users()) { + if (AUser == BUser) { + Found = true; + break; + } + auto BUCN = getContainingComposite(dyn_cast(BUser)); + if (AUCN && AUCN == BUCN) { + Found = true; + break; + } + } + if (!Found) { + LLVM_DEBUG(dbgs() << "AUser doesn't have a match: "; AUser->dump()); + return false; + } + } + + return true; + } + + Value *followUseChain(Value *V) { + if (V->hasOneUser()) + return followUseChain(*V->user_begin()); + + // TODO handle multiple users, but how? 
+ + return V; + } + + Value *getFinalInputReplacement(Instruction *I) { + for (Value *V : I->operands()) { + auto *Op = dyn_cast(V); + while (Op && shouldIgnoreValue(Op)) + Op = dyn_cast(Op->getOperand(0)); + if (Op == nullptr) + continue; + + auto CN = getContainingComposite(Op); + if (CN == nullptr || CN->ReplacementNode == nullptr) + continue; + return followUseChain(CN->ReplacementNode); + } + + return nullptr; + } + + Value *getReplacement(Instruction *I) { + if (!I) + return nullptr; + auto CN = getContainingComposite(I); + if (CN == nullptr || CN->ReplacementNode == nullptr) + return I; + return CN->ReplacementNode; + } + + std::shared_ptr + prepareCompositeNode(ComplexDeinterleavingOperation Operation) { + return std::make_shared(Operation); + } + + void + submitCompositeNode(std::shared_ptr CN) { + CompositeNodes.push_back(CN); + } + + bool containsNode(Instruction *I) { + return llvm::find(Instructions, I) != Instructions.end(); + } + + /// Certain values, such as extends and truncates, should be ignored within + /// the graph for our needs as they contribute towards structure rather than + /// function. + /// + /// e.g. A deinterleaving shuffle provides no functionality itself, + /// and does not need to be explicitly handled beyond the usual operations. A + /// shuffle that is neither interleaving nor deinterleaving is an example of + /// one that needs to be handled, and thus should not be ignored. + bool shouldIgnoreValue(Value *V) { + if (isa(V)) + return true; + + if (auto *SVI = dyn_cast(V)) { + auto Mask = SVI->getShuffleMask(); + return isInterleavingMask(Mask) || isDeinterleavingMask(Mask); + } + + if (auto *I = dyn_cast(V)) { + auto Opc = I->getOpcode(); + return I->isCast() || Opc == Instruction::FPTrunc || + Opc == Instruction::FPExt; + } + return false; + } + + /// Checks the users of the given instructions to evaluate whether the + /// returns from said instructions converge at any point. e.g. in a shuffle + bool doInstructionsConverge(Instruction *A, Instruction *B) { + if (A->hasOneUser() && B->hasOneUser()) { + auto *AUser = *A->user_begin(); + auto *BUser = *B->user_begin(); + + while (shouldIgnoreValue(AUser)) + AUser = *AUser->user_begin(); + while (shouldIgnoreValue(BUser)) + BUser = *BUser->user_begin(); + + if (AUser == BUser) + return true; + } + + return haveSharedUses(A, B); + } + + NodePtr getContainingComposite(Instruction *I) { + if (I == nullptr) + return nullptr; + for (const auto &CN : CompositeNodes) { + if (CN->contains(I)) + return CN; + if (CN->ReplacementNode == I) + return CN; + } + return nullptr; + } + + bool identifyCMulPartial(Instruction *I, const TargetLowering *TL, + bool &ContinueIdentification); + bool identifyOrphanedCMulPartial(Instruction *I, Instruction *J, + const TargetLowering *TL, + bool &ContinueIdentification); + bool identifyCAdd(Instruction *I, Instruction *J, const TargetLowering *TL, + bool &ContinueIdentification); + +public: + /// Step through the use-def chains to find all instruction nodes converging + /// on \p I. + void discoverNodes(BasicBlock *B, Instruction *I); + /// Iterate over the nodes and reducing them to complex nodes where possible. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + bool identifyNodes(const TargetLowering *TL); + /// Perform the actual replacement of the underlying instruction graph. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. 
+  bool replaceNodes(const TargetLowering *TL);
+  void getDeadRoots(SmallVector<Instruction *> &DeadInstrRoots);
+};
+
+class ComplexDeinterleaving {
+public:
+  ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli)
+      : TL(tl), TLI(tli) {}
+  bool runOnFunction(Function &F);
+
+private:
+  bool evaluateBasicBlock(BasicBlock *B);
+
+  const TargetLowering *TL = nullptr;
+  const TargetLibraryInfo *TLI = nullptr;
+};
+
+} // namespace
+
+char ComplexDeinterleavingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                      "Complex Deinterleaving", false, false)
+INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                    "Complex Deinterleaving", false, false)
+
+PreservedAnalyses ComplexDeinterleavingPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) {
+  return new ComplexDeinterleavingLegacyPass(TM);
+}
+
+bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) {
+  const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  return ComplexDeinterleaving(TL, &TLI).runOnFunction(F);
+}
+
+bool ComplexDeinterleaving::runOnFunction(Function &F) {
+  if (!ComplexArithmeticEnabled) {
+    LLVM_DEBUG(dbgs() << "Complex has been explicitly disabled.\n");
+    return false;
+  }
+
+  if (!TL->isComplexDeinterleavingSupported()) {
+    LLVM_DEBUG(dbgs() << "Complex has been disabled, target does not support "
+                         "lowering of complex numbers.\n");
+    return false;
+  }
+
+  bool Changed = false;
+  for (auto &B : F)
+    Changed |= evaluateBasicBlock(&B);
+
+  return Changed;
+}
+
+static bool isInterleavingMask(ArrayRef<int> Mask) {
+  int HalfNumElements = Mask.size() / 2;
+
+  for (int Idx = 0; Idx < HalfNumElements; ++Idx) {
+    if (Mask[(Idx * 2) + 1] != (Mask[Idx * 2] + HalfNumElements))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isDeinterleavingMask(ArrayRef<int> Mask) {
+  int Offset = Mask[0];
+  int HalfNumElements = Mask.size() / 2;
+
+  for (int Idx = 1; Idx < HalfNumElements; ++Idx) {
+    if (Mask[Idx] != (Idx * 2) + Offset)
+      return false;
+  }
+
+  return true;
+}
+
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
+  bool Changed = false;
+
+  SmallVector<Instruction *> DeadInstrRoots;
+
+  for (auto &I : *B) {
+    if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+      // Look for a shufflevector that takes separate vectors of the real and
+      // imaginary components and recombines them into a single vector.
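+      // Illustrative sketch (hypothetical IR, not taken from this patch's
+      // tests): given deinterleaved <2 x float> values %real and %imag, the
+      // converging shuffle this pass keys off would look like
+      //   shufflevector <2 x float> %real, <2 x float> %imag,
+      //                 <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+      // i.e. an interleaving mask in the sense checked below.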
+      if (isInterleavingMask(SVI->getShuffleMask())) {
+        ComplexDeinterleavingGraph Graph;
+        Graph.discoverNodes(B, SVI);
+        if (Graph.identifyNodes(TL) && Graph.replaceNodes(TL)) {
+          Changed = true;
+          DeadInstrRoots.push_back(SVI);
+        } else {
+          SmallVector<Instruction *> DeadInstrs;
+          Graph.getDeadRoots(DeadInstrs);
+          for (auto It = DeadInstrs.rbegin(); It != DeadInstrs.rend(); It++)
+            (*It)->eraseFromParent();
+        }
+      }
+    }
+  }
+
+  for (const auto &I : DeadInstrRoots)
+    llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+
+  return Changed;
+}
+
+bool ComplexDeinterleavingGraph::identifyCMulPartial(
+    Instruction *I, const TargetLowering *TL, bool &ContinueIdentification) {
+  if ((match(I, m_FAdd(m_FMul(m_Value(), m_Value()),
+                       m_FMul(m_Value(), m_Value()))) ||
+       match(I, m_FSub(m_FMul(m_Value(), m_Value()),
+                       m_FMul(m_Value(), m_Value()))))) {
+
+    auto *VTy = dyn_cast<FixedVectorType>(I->getType());
+    if (!VTy)
+      return false;
+
+    auto *NewVTy =
+        FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);
+
+    if (!TL->isComplexDeinterleavingOperationSupported(
+            ComplexDeinterleavingOperation::CMulPartial, NewVTy))
+      return false;
+
+    LLVM_DEBUG(dbgs() << "Composite node built up from "; I->dump());
+    auto CN =
+        prepareCompositeNode(llvm::ComplexDeinterleavingOperation::CMulPartial);
+
+    auto *Op0 = cast<Instruction>(I->getOperand(0));
+    auto *Op1 = cast<Instruction>(I->getOperand(1));
+
+    CN->addInstruction(I);
+    CN->addInstruction(Op0);
+    CN->addInstruction(Op1);
+
+    CN->OriginalInput0 = Op0;
+    CN->OriginalInput1 = Op1;
+
+    bool ContainsNeg = false;
+    for (Value *V : Op0->operands()) {
+      auto *Op = dyn_cast<Instruction>(V);
+      if (Op && Op->getOpcode() == Instruction::FNeg) {
+        if (ContainsNeg)
+          break;
+        CN->addInstruction(Op);
+        CN->OriginalInput0 = Op;
+        ContainsNeg = true;
+      }
+    }
+    for (Value *V : Op1->operands()) {
+      auto *Op = dyn_cast<Instruction>(V);
+      if (Op && Op->getOpcode() == Instruction::FNeg) {
+        if (ContainsNeg)
+          break;
+        CN->addInstruction(Op);
+        CN->OriginalInput1 = Op;
+        ContainsNeg = true;
+      }
+    }
+
+    if (!ContainsNeg) {
+      auto &Use = (*I->use_begin());
+      if (I->getOpcode() == Instruction::FSub) {
+        if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 0) {
+          LLVM_DEBUG(dbgs()
+                     << "First converging shuffle operand should be an FSub"
+                     << ".\n");
+          ContinueIdentification = false;
+          return false;
+        }
+      } else if (I->getOpcode() == Instruction::FAdd) {
+        if (isa<ShuffleVectorInst>(Use.getUser()) && Use.getOperandNo() != 1) {
+          LLVM_DEBUG(dbgs()
+                     << "Second converging shuffle operand should be an FAdd"
+                     << ".\n");
+          return false;
+        }
+      }
+    }
+
+    auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()),
+                           m_Shuffle(m_Value(), m_Value()));
+    CN->IsTopLevel = match(CN->OriginalInput0, Pattern) &&
+                     match(CN->OriginalInput1, Pattern);
+    CN->UsesNegation = ContainsNeg;
+    CN->OutputNode = I;
+
+    CN->Rotation = (I->getOpcode() == Instruction::FAdd) * 90;
+
+    if (I->getOpcode() == Instruction::FSub) {
+      auto *SubOp0 = cast<Instruction>(I->getOperand(0));
+      auto SubOp0C0 = getOperatingComponentOfValue(SubOp0->getOperand(0));
+      auto SubOp0C1 = getOperatingComponentOfValue(SubOp0->getOperand(1));
+
+      if (SubOp0C0 == SubOp0C1) {
+        if (SubOp0C0 == OperatingComponent::Imaginary) {
+          CN->Rotation += 90;
+        }
+      }
+    }
+
+    if (CN->UsesNegation)
+      CN->Rotation += 180;
+
+    submitCompositeNode(CN);
+    return true;
+  }
+  ContinueIdentification = true;
+  return false;
+}
+
+bool ComplexDeinterleavingGraph::identifyOrphanedCMulPartial(
+    Instruction *I, Instruction *J, const TargetLowering *TL,
+    bool &ContinueIdentification) {
+  if ((I->getOpcode() == Instruction::FMul &&
+       J->getOpcode() ==
Instruction::FMul)) { + + // At this point, all operands should be instructions + if (!isa(I->getOperand(0)) || + !isa(I->getOperand(1))) + return false; + if (!isa(J->getOperand(0)) || + !isa(J->getOperand(1))) + return false; + + auto *VTy = dyn_cast(I->getType()); + if (!VTy) + return false; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy)) + return false; + + // Partial mul + auto CN = + prepareCompositeNode(llvm::ComplexDeinterleavingOperation::CMulPartial); + CN->addInstruction(I); + CN->addInstruction(J); + CN->OriginalInput0 = I; + CN->OriginalInput1 = J; + + unsigned SharedIdx; + auto *SharedOp = + dyn_cast_or_null(getSharedOperand(I, J, SharedIdx)); + if (SharedOp) { + auto Opc = SharedOp->getOpcode(); + if (Opc == Instruction::FNeg) { + if (SharedIdx == 0) + CN->OriginalInput0 = SharedOp; + else if (SharedIdx == 1) + CN->OriginalInput1 = SharedOp; + else { + LLVM_DEBUG(dbgs() << "Unknown input pattern, somehow the shared " + "operand index is greater than 1.\n"); + return false; + } + CN->addInstruction(SharedOp); + CN->UsesNegation = true; + } + } + + auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()), + m_Shuffle(m_Value(), m_Value())); + CN->IsTopLevel = match(CN->OriginalInput0, Pattern) && + match(CN->OriginalInput1, Pattern); + CN->OutputNode = J; + submitCompositeNode(CN); + return true; + } + ContinueIdentification = true; + return false; +} + +bool ComplexDeinterleavingGraph::identifyCAdd(Instruction *I, Instruction *J, + const TargetLowering *TL, + bool &ContinueIdentification) { + if (((I->getOpcode() == Instruction::FSub && + J->getOpcode() == Instruction::FAdd) || + (I->getOpcode() == Instruction::FAdd && + J->getOpcode() == Instruction::FSub))) { + + auto *VTy = dyn_cast(I->getType()); + if (!VTy) + return false; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy)) + return false; + + LLVM_DEBUG(dbgs() << "Pairing instructions as a CAdd.\n"); + auto CN = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd); + CN->addInstruction(I); + CN->addInstruction(J); + CN->OriginalInput0 = I; + CN->OriginalInput1 = J; + + auto *Sub = I->getOpcode() == Instruction::FSub ? I : J; + bool IsLikelyNegated = false; + if (auto *Shuffle = dyn_cast(Sub->getOperand(0))) { + auto ShuffleMask = Shuffle->getShuffleMask(); + if (isDeinterleavingMask(ShuffleMask)) + IsLikelyNegated = ShuffleMask[0] == 1; + } + + if (IsLikelyNegated) { + LLVM_DEBUG(dbgs() << "Negated adds are not yet supported.\n"); + return false; + } + + CN->UsesNegation = IsLikelyNegated; + CN->Rotation = 90; + if (I->getOpcode() == Instruction::FAdd) + CN->Rotation = 270; + CN->OutputNode = J; + + Instruction *FAdd = I; + if (FAdd->getOpcode() != Instruction::FAdd) + FAdd = J; + if (getOperatingComponentOfValue(FAdd->getOperand(1)) != + OperatingComponent::Real) { + LLVM_DEBUG(dbgs() << "CAdd.FAdd[1] should be the real component.\n"); + return false; + } + + submitCompositeNode(CN); + return true; + } + ContinueIdentification = true; + return false; +} + +void ComplexDeinterleavingGraph::discoverNodes(BasicBlock *B, Instruction *I) { + + if (I->getParent() != B) + return; + + if (containsNode(I)) + return; + + if (isa(I) || isa(I)) { + // No need to discover beyond a load or a phi. 
+ return; + } + + addInstruction(I); + + if (auto *SVI = dyn_cast(I)) { + auto ShuffleMask = SVI->getShuffleMask(); + + static const int RealMask[] = {0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30}; + static const int ImagMask[] = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + + ArrayRef RealMaskRef(RealMask, ShuffleMask.size()); + ArrayRef ImagMaskRef(ImagMask, ShuffleMask.size()); + + Value *ShuffleSource; + if (match(SVI, m_Shuffle(m_Value(ShuffleSource), m_Poison(), + m_SpecificMask(RealMaskRef))) || + match(SVI, m_Shuffle(m_Value(ShuffleSource), m_Poison(), + m_SpecificMask(ImagMaskRef)))) { + // Reached "top" of graph, stop discovering. + // TODO this check needs refining + if (ShuffleSource && + (isa(ShuffleSource) || !isa(ShuffleSource))) + return; + } + } + + for (const auto &Op : I->operands()) { + if (auto *OpI = dyn_cast(Op)) + discoverNodes(B, OpI); + } +} + +bool ComplexDeinterleavingGraph::identifyNodes(const TargetLowering *TL) { + if (Instructions.empty()) { + LLVM_DEBUG(dbgs() << "No Instructions, nothing to identify.\n"); + return false; + } + + auto *ConvergingI = Instructions[0]; + + for (auto &I : Instructions) { + bool ContinueIdentification = false; + if (!identifyCMulPartial(I, TL, ContinueIdentification) && + !ContinueIdentification) + return false; + } + + auto Unmatched = findUnmatchedInstructions(); + SmallVector Pairs; + + for (auto &I : Unmatched) { + if (llvm::find(Pairs, I) != Pairs.end()) + continue; + for (auto &J : Unmatched) { + if (I == J || llvm::find(Pairs, J) != Pairs.end()) + continue; + + if (doInstructionsConverge(I, J)) { + Pairs.push_back(I); + Pairs.push_back(J); + break; + } + } + } + + // Try match found pairs + for (unsigned i = 0; i < Pairs.size(); i += 2) { + auto *I = Pairs[i]; + auto *J = Pairs[i + 1]; + + bool ContinueIdentification = false; + if (!identifyOrphanedCMulPartial(I, J, TL, ContinueIdentification)) { + if (ContinueIdentification) + continue; + return false; + } + + ContinueIdentification = false; + if (!identifyCAdd(I, J, TL, ContinueIdentification)) { + if (ContinueIdentification) + continue; + return false; + } + } + + auto UnmatchedInstructions = findUnmatchedInstructions(); + if (!UnmatchedInstructions.empty()) { + LLVM_DEBUG(dbgs() << "Unmatched instructions found in graph, cannot " + "confidently generate complex intrinsics.\n";); + return false; + } + + if (CompositeNodes.empty()) { + LLVM_DEBUG(dbgs() << "No composite nodes found.\n"); + return false; + } + + sortCompositeNodes(ConvergingI->getParent()); + + for (auto *It = CompositeNodes.begin() + 1; It != CompositeNodes.end(); + It++) { + auto CN = *It; + auto PrevCN = *(It - 1); + if (haveSharedUses(CN->OutputNode, PrevCN->OutputNode)) { + CN->Accumulator = PrevCN->OutputNode; + PrevCN->Accumulatee = CN->OutputNode; + } + } + + return true; +} + +bool ComplexDeinterleavingGraph::replaceNodes(const TargetLowering *TL) { + if (CompositeNodes.empty()) + return false; + + unsigned GeneratedIntrinsics = 0; + auto *ConvergingI = Instructions[0]; + + auto TTI = TL->getTargetMachine().getTargetTransformInfo( + *ConvergingI->getFunction()); + for (const auto &CN : CompositeNodes) { + auto *N = cast(CN->OutputNode); + + // Wrangle the inputs + + /// If the given value is part of a CompositeNode, and said node is part of + /// an accumulator chain, return the accumulator. 
Otherwise, returns the + /// "best fit" value (the ReplacementNode of a containing CompositeNode, or + /// the value itself) + auto FollowAccumulatorIfNecessary = [&](Value *V) -> Value * { + auto *I = dyn_cast(V); + if (!I) + return V; + + auto CN = getContainingComposite(I); + if (!CN) + return I; + + if (CN->Accumulatee) + CN = getContainingComposite(cast(CN->Accumulatee)); + + return CN->ReplacementNode; + }; + + /// Given a value and an operand index, get said operand and return it. + /// If the discovered operand is part of a composite node, return the + /// replacement instead. + auto GetInputFromOriginalInput = [&](Value *OriginalInput, + unsigned OpIdx) -> Value * { + auto *OriginalI = cast(OriginalInput); + if (OriginalI->getOpcode() == Instruction::FNeg) + OpIdx = 0; + + auto *Op = OriginalI->getOperand(OpIdx); + if (auto *SVI = dyn_cast(Op)) + Op = SVI->getOperand(0); + + if (!Op) + return nullptr; + + if (auto *I = dyn_cast(Op)) { + if (auto Containing = getContainingComposite(I)) { + if (Containing->ReplacementNode) + return Containing->ReplacementNode; + } + } + return Op; + }; + + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CAdd) { + Value *Sub = nullptr; + if (auto *Op0 = dyn_cast(CN->OriginalInput0)) { + if (Op0->getOpcode() == Instruction::FSub) + Sub = Op0; + } + if (!Sub) { + if (auto *Op1 = dyn_cast(CN->OriginalInput1)) { + if (Op1->getOpcode() == Instruction::FSub) + Sub = Op1; + } + } + + if (!Sub) + return false; + + CN->Input0 = + FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 0)); + CN->Input1 = + FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 1)); + } else { + CN->Input0 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput0, 0)); + CN->Input1 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput1, 0)); + + if (CN->OriginalInput0 != CN->OriginalInput1 && CN->Input0 == CN->Input1) + CN->Input1 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput1, 1)); + } + + if (CN->Input0 == nullptr || CN->Input1 == nullptr) + continue; + + if (CN->Accumulator) { + if (auto Node = + getContainingComposite(cast(CN->Accumulator))) + CN->Accumulator = cast(Node->ReplacementNode); + } + + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CMulPartial && + CN->Accumulator) { + if (auto Node = + getContainingComposite(cast(CN->Accumulator))) { + bool Valid90 = (Node->Rotation == 0 && CN->Rotation == 90) || + (Node->Rotation == 90 && CN->Rotation == 0); + bool Valid270 = (Node->Rotation == 180 && CN->Rotation == 270) || + (Node->Rotation == 270 && CN->Rotation == 180); + if (!Valid90 && !Valid270) { + LLVM_DEBUG(dbgs() << "Invalid rotation pairs.\n"); + return false; + } + + CN->Input0 = Node->Input0; + CN->Input1 = Node->Input1; + } + } + + CN->ReplacementNode = TL->createComplexDeinterleavingIR( + N, CN->Operation, CN->Rotation, CN->Input0, CN->Input1, + CN->Accumulator); + if (!CN->ReplacementNode) { + LLVM_DEBUG(dbgs() << "Target failed to create Intrinsic call.\n"); + return false; + } + + cast(CN->ReplacementNode) + ->moveAfter(cast(CN->OutputNode)); + + CostOfIntrinsics += TTI.getInstructionCost( + cast(CN->ReplacementNode), CostKind); + GeneratedIntrinsics += 1; + } + + auto *R = getFinalInputReplacement(ConvergingI); + if (!R) { + LLVM_DEBUG(dbgs() << "Unable to find Final Input Replacement.\n"); + return false; + } + + InstructionCost CostOfNodes; + for (const auto &I : Instructions) + CostOfNodes += TTI.getInstructionCost(I, CostKind); + + 
LLVM_DEBUG(dbgs() << "Evaluating cost of each graph. Instructions: " + << CostOfNodes << ", Intrinsics: " << CostOfIntrinsics + << ".\n"); + if (CostOfIntrinsics > CostOfNodes) { + LLVM_DEBUG(dbgs() << "Not replacing, cost was too high.\n"); + return false; + } + + ConvergingI->replaceAllUsesWith(R); + + NumComplexIntrinsics += GeneratedIntrinsics; + + return true; +} + +void ComplexDeinterleavingGraph::getDeadRoots( + SmallVector &DeadInstrRoots) { + for (const auto &CN : CompositeNodes) { + if (auto *I = dyn_cast_or_null(CN->ReplacementNode)) + DeadInstrRoots.push_back(I); + } +} diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -740,6 +740,15 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21727,3 +21727,77 @@ MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } + +bool ARMTargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasMVEFloatOps(); +} + +bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast(Ty); + if (!VTy) + return false; + + if (VTy->getNumElements() * VTy->getScalarSizeInBits() != 128) + return false; + + // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 + auto *ScalarTy = VTy->getScalarType(); + if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy()) + return true; + + return false; +} + +Value *ARMTargetLowering::createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { + + IRBuilder<> B(I); + auto *IntTy = Type::getInt32Ty(B.getContext()); + auto *Ty = InputA->getType(); + + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + + ConstantInt *ConstMulRot = nullptr; + + if (Rotation == 0) + ConstMulRot = ConstantInt::get(IntTy, 0); + else if (Rotation == 90) + ConstMulRot = ConstantInt::get(IntTy, 1); + else if (Rotation == 180) + ConstMulRot = ConstantInt::get(IntTy, 2); + else if (Rotation == 270) + ConstMulRot = ConstantInt::get(IntTy, 3); + + if (!ConstMulRot) + return nullptr; + + if (Accumulator) + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstMulRot, Accumulator, InputB, InputA}); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + + // 1 means the value is not halved. 
+ unsigned HalvingVal = 1; + auto *Halving = ConstantInt::get(IntTy, HalvingVal); + + unsigned RotKey; + if (Rotation == 90) + RotKey = 0; + else if (Rotation == 270) + RotKey = 1; + else + return nullptr; // Invalid rotation for arm_mve_vcaddq + + auto *RotVal = ConstantInt::get(IntTy, RotKey); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {Halving, RotVal, InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -426,12 +426,17 @@ TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. - if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createARMParallelDSPPass()); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll @@ -0,0 +1,301 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define arm_aapcs_vfpcc <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vadd.f16 s2, s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vsub.f16 s0, s4, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %b.real, %a.imag + %1 = fadd fast <1 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + ret <2 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vadd.f16 q3, q3, q0 +; CHECK-NEXT: vsub.f16 q0, q1, q2 +; CHECK-NEXT: vmovx.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %b.real, 
%a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + ret <4 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + ret <8 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vins.f16 s16, s1 +; CHECK-NEXT: vmovx.f16 s1, s1 +; CHECK-NEXT: vins.f16 s17, s3 +; CHECK-NEXT: vmovx.f16 s20, s8 +; CHECK-NEXT: vmovx.f16 s18, s9 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s3, s3 +; CHECK-NEXT: vins.f16 s20, s18 +; CHECK-NEXT: vmovx.f16 s21, s10 +; CHECK-NEXT: vmovx.f16 s18, s11 +; CHECK-NEXT: vmovx.f16 s22, s12 +; CHECK-NEXT: vmovx.f16 s24, s13 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vmovx.f16 s3, s5 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vins.f16 s21, s18 +; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vins.f16 s22, s24 +; CHECK-NEXT: vmovx.f16 s23, s14 +; CHECK-NEXT: vmovx.f16 s24, s15 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmovx.f16 s3, s6 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vins.f16 s19, s7 +; CHECK-NEXT: vins.f16 s23, s24 +; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vadd.f16 q4, q5, q4 +; CHECK-NEXT: vsub.f16 q2, q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s5, s10 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vins.f16 s11, s19 +; CHECK-NEXT: vins.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vins.f16 s10, s18 +; CHECK-NEXT: vins.f16 s9, s17 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vmov.f32 s3, s12 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x 
half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + ret <16 x half> %interleaved.vec +} +define arm_aapcs_vfpcc <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s17, s2 +; CHECK-NEXT: vmovx.f16 s18, s3 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vmov.f32 s20, s24 +; CHECK-NEXT: vins.f16 s17, s18 +; CHECK-NEXT: vmovx.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s22, s5 +; CHECK-NEXT: vmovx.f16 s19, s6 +; CHECK-NEXT: vmovx.f16 s23, s7 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s1, s25 +; CHECK-NEXT: vmovx.f16 s24, s24 +; CHECK-NEXT: vmov.f32 s21, s26 +; CHECK-NEXT: vins.f16 s20, s25 +; CHECK-NEXT: vins.f16 s18, s22 +; CHECK-NEXT: vmov.f32 s22, s28 +; CHECK-NEXT: vins.f16 s19, s23 +; CHECK-NEXT: vmov.f32 s23, s30 +; CHECK-NEXT: vins.f16 s24, s1 +; CHECK-NEXT: vmovx.f16 s25, s26 +; CHECK-NEXT: vmovx.f16 s1, s27 +; CHECK-NEXT: vins.f16 s21, s27 +; CHECK-NEXT: vins.f16 s25, s1 +; CHECK-NEXT: vmovx.f16 s26, s28 +; CHECK-NEXT: vmovx.f16 s1, s29 +; CHECK-NEXT: vins.f16 s22, s29 +; CHECK-NEXT: vins.f16 s23, s31 +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vins.f16 s26, s1 +; CHECK-NEXT: vmovx.f16 s1, s31 +; CHECK-NEXT: vmovx.f16 s27, s30 +; CHECK-NEXT: vsub.f16 q4, q5, q4 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vins.f16 s27, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmovx.f16 s28, s8 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmovx.f16 s6, s23 +; CHECK-NEXT: vadd.f16 q0, q6, q0 +; CHECK-NEXT: vmovx.f16 s27, s22 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vins.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s27, s6 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s5, s18 +; CHECK-NEXT: vmovx.f16 s0, s2 +; CHECK-NEXT: vins.f16 s18, s2 +; CHECK-NEXT: vmovx.f16 s26, s20 +; CHECK-NEXT: vmovx.f16 s2, s21 +; CHECK-NEXT: vins.f16 s28, s6 +; CHECK-NEXT: vmovx.f16 s29, s10 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vins.f16 s26, s2 +; CHECK-NEXT: vmov.f32 s2, s14 +; CHECK-NEXT: vins.f16 s29, s6 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmovx.f16 s30, s12 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vins.f16 s30, s6 +; CHECK-NEXT: vins.f16 s0, s13 +; CHECK-NEXT: vins.f16 s2, s15 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vmovx.f16 s31, s14 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vins.f16 s31, s6 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vmovx.f16 s24, s12 +; CHECK-NEXT: vmovx.f16 s6, s13 +; CHECK-NEXT: vmovx.f16 s25, s14 +; CHECK-NEXT: vins.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s6, s15 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vins.f16 s20, s21 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vins.f16 s25, s6 +; CHECK-NEXT: vmov.f32 s13, s14 +; 
CHECK-NEXT: vmovx.f16 s7, s19 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vins.f16 s19, s3 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vadd.f16 q2, q6, q2 +; CHECK-NEXT: vmov.f32 s15, s22 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vsub.f16 q5, q3, q7 +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmovx.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmovx.f16 s26, s21 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vins.f16 s22, s10 +; CHECK-NEXT: vmovx.f16 s15, s23 +; CHECK-NEXT: vins.f16 s23, s11 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vins.f16 s20, s8 +; CHECK-NEXT: vins.f16 s21, s9 +; CHECK-NEXT: vins.f16 s17, s1 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: vmov.f32 s14, s23 +; CHECK-NEXT: vmov.f32 s3, s24 +; CHECK-NEXT: vmov.f32 s11, s26 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x half> %b.real, %a.imag + %1 = fadd fast <16 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define arm_aapcs_vfpcc <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmul.f16 s6, s2, s0 +; CHECK-NEXT: vfma.f16 s6, s4, s8 +; CHECK-NEXT: vmul.f16 s8, s8, s2 +; CHECK-NEXT: vfnms.f16 s8, s4, s0 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %b.imag, %a.real + %1 = fmul fast <1 x half> %b.real, %a.imag + %2 = fadd fast <1 x half> %1, %0 + %3 = fmul fast <1 x half> %b.real, %a.real + %4 = fmul fast <1 x half> %a.imag, %b.imag + %5 = fsub fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> + ret <2 x half> %interleaved.vec +} + +define 
arm_aapcs_vfpcc <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vmul.f16 q4, q3, q0 +; CHECK-NEXT: vfma.f16 q4, q1, q2 +; CHECK-NEXT: vmul.f16 q2, q2, q3 +; CHECK-NEXT: vneg.f16 q2, q2 +; CHECK-NEXT: vfma.f16 q2, q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s9, s8 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %b.imag, %a.real + %1 = fmul fast <2 x half> %b.real, %a.imag + %2 = fadd fast <2 x half> %1, %0 + %3 = fmul fast <2 x half> %b.real, %a.real + %4 = fmul fast <2 x half> %a.imag, %b.imag + %5 = fsub fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> + ret <4 x half> %interleaved.vec +} + +define arm_aapcs_vfpcc <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #90 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %b.imag, %a.real + %1 = fmul fast <4 x half> %b.real, %a.imag + %2 = fadd fast <4 x half> %1, %0 + %3 = fmul fast <4 x half> %b.real, %a.real + %4 = fmul fast <4 x half> %a.imag, %b.imag + %5 = fsub fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> + ret <8 x half> %interleaved.vec +} + +define arm_aapcs_vfpcc <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmovx.f16 s19, s6 +; CHECK-NEXT: vmovx.f16 s24, s7 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmovx.f16 s16, s0 +; CHECK-NEXT: vmovx.f16 s18, s1 +; CHECK-NEXT: vins.f16 s19, s24 +; CHECK-NEXT: vmovx.f16 s24, s8 +; CHECK-NEXT: vmovx.f16 s8, s11 +; CHECK-NEXT: vmovx.f16 s25, s10 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s17, s2 +; CHECK-NEXT: vmovx.f16 s18, s3 +; CHECK-NEXT: vins.f16 s25, s8 +; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s26, s12 +; CHECK-NEXT: vins.f16 s17, s18 +; CHECK-NEXT: vmovx.f16 s18, s4 +; CHECK-NEXT: vmovx.f16 s22, s5 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s1, s9 +; CHECK-NEXT: vins.f16 s26, s8 +; CHECK-NEXT: vmovx.f16 s8, s15 +; CHECK-NEXT: vmovx.f16 s27, s14 +; CHECK-NEXT: vins.f16 s18, s22 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s24, s1 +; 
CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s27, s8 +; CHECK-NEXT: vmov.f32 s21, s10 +; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vins.f16 s20, s9 +; CHECK-NEXT: vmov.f32 s23, s14 +; CHECK-NEXT: vins.f16 s21, s11 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s22, s13 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vins.f16 s23, s15 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmul.f16 q1, q4, q6 +; CHECK-NEXT: vmul.f16 q2, q6, q0 +; CHECK-NEXT: vneg.f16 q3, q1 +; CHECK-NEXT: vfma.f16 q3, q5, q0 +; CHECK-NEXT: vfma.f16 q2, q5, q4 +; CHECK-NEXT: vmovx.f16 s4, s12 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s5, s14 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s8, s13 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmovx.f16 s7, s15 +; CHECK-NEXT: vins.f16 s15, s11 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s0, s11 +; CHECK-NEXT: vins.f16 s14, s10 +; CHECK-NEXT: vins.f16 s13, s9 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vmov.f32 s3, s8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %b.imag, %a.real + %1 = fmul fast <8 x half> %b.real, %a.imag + %2 = fadd fast <8 x half> %1, %0 + %3 = fmul fast <8 x half> %b.real, %a.real + %4 = fmul fast <8 x half> %a.imag, %b.imag + %5 = fsub fast <8 x half> %3, %4 + %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> + ret <16 x half> %interleaved.vec +} + +define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmovx.f16 s16, s24 +; CHECK-NEXT: vmovx.f16 s18, s25 +; CHECK-NEXT: vins.f16 s16, s18 +; CHECK-NEXT: vmovx.f16 s17, s26 +; CHECK-NEXT: vmovx.f16 s18, s27 +; CHECK-NEXT: vmovx.f16 s19, s29 +; CHECK-NEXT: vins.f16 s17, s18 +; CHECK-NEXT: vmovx.f16 s18, s28 +; CHECK-NEXT: vins.f16 s18, s19 +; CHECK-NEXT: vmovx.f16 s19, s30 +; CHECK-NEXT: vmovx.f16 s8, s31 +; CHECK-NEXT: vmov.f32 s20, s0 +; CHECK-NEXT: vins.f16 s19, s8 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vmov.f32 s21, s2 +; CHECK-NEXT: vmov.f32 s22, s4 +; CHECK-NEXT: vins.f16 s20, s1 +; CHECK-NEXT: vmov.f32 s23, s6 +; CHECK-NEXT: vins.f16 s0, s8 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s8, s3 +; CHECK-NEXT: vins.f16 s21, s3 +; CHECK-NEXT: vins.f16 s1, s8 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vins.f16 s22, s5 +; CHECK-NEXT: vins.f16 s23, s7 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmovx.f16 s3, s6 +; CHECK-NEXT: vins.f16 s26, s27 +; CHECK-NEXT: 
vins.f16 s30, s31 +; CHECK-NEXT: vins.f16 s28, s29 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vins.f16 s24, s25 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vins.f16 s3, s4 +; CHECK-NEXT: vmul.f16 q2, q4, q5 +; CHECK-NEXT: vmov.f32 s26, s28 +; CHECK-NEXT: add r0, sp, #128 +; CHECK-NEXT: vmov.f32 s27, s30 +; CHECK-NEXT: vfma.f16 q2, q6, q0 +; CHECK-NEXT: vmul.f16 q0, q0, q4 +; CHECK-NEXT: vneg.f16 q4, q0 +; CHECK-NEXT: vmovx.f16 s0, s8 +; CHECK-NEXT: vfma.f16 q4, q6, q5 +; CHECK-NEXT: vmovx.f16 s20, s12 +; CHECK-NEXT: vmovx.f16 s4, s16 +; CHECK-NEXT: vmovx.f16 s5, s18 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s10 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s13 +; CHECK-NEXT: vins.f16 s16, s8 +; CHECK-NEXT: vins.f16 s18, s10 +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vins.f16 s20, s0 +; CHECK-NEXT: vmovx.f16 s21, s14 +; CHECK-NEXT: vmovx.f16 s0, s15 +; CHECK-NEXT: vmovx.f16 s22, s8 +; CHECK-NEXT: vins.f16 s21, s0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vins.f16 s22, s6 +; CHECK-NEXT: vmovx.f16 s23, s10 +; CHECK-NEXT: vmovx.f16 s6, s11 +; CHECK-NEXT: vmov.f32 s24, s0 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vins.f16 s23, s6 +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s24, s1 +; CHECK-NEXT: vins.f16 s0, s6 +; CHECK-NEXT: vmovx.f16 s6, s3 +; CHECK-NEXT: vmovx.f16 s1, s2 +; CHECK-NEXT: vmov.f32 s25, s2 +; CHECK-NEXT: vins.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s6, s29 +; CHECK-NEXT: vmovx.f16 s2, s28 +; CHECK-NEXT: vins.f16 s25, s3 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vmovx.f16 s6, s31 +; CHECK-NEXT: vmovx.f16 s3, s30 +; CHECK-NEXT: vmov.f32 s26, s28 +; CHECK-NEXT: vmov.f32 s27, s30 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vins.f16 s3, s6 +; CHECK-NEXT: vmov.f32 s14, s8 +; CHECK-NEXT: vins.f16 s26, s29 +; CHECK-NEXT: vmov.f32 s15, s10 +; CHECK-NEXT: vins.f16 s27, s31 +; CHECK-NEXT: vmul.f16 q7, q0, q3 +; CHECK-NEXT: vmul.f16 q0, q5, q0 +; CHECK-NEXT: vfma.f16 q7, q6, q5 +; CHECK-NEXT: vneg.f16 q5, q0 +; CHECK-NEXT: vfma.f16 q5, q6, q3 +; CHECK-NEXT: vmovx.f16 s0, s28 +; CHECK-NEXT: vmovx.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmovx.f16 s0, s30 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmovx.f16 s24, s17 +; CHECK-NEXT: vmovx.f16 s7, s19 +; CHECK-NEXT: vmovx.f16 s0, s1 +; CHECK-NEXT: vmovx.f16 s26, s21 +; CHECK-NEXT: vins.f16 s24, s0 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vmovx.f16 s0, s29 +; CHECK-NEXT: vins.f16 s22, s30 +; CHECK-NEXT: vmovx.f16 s15, s23 +; CHECK-NEXT: vins.f16 s23, s31 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vins.f16 s20, s28 +; CHECK-NEXT: vins.f16 s21, s29 +; CHECK-NEXT: vins.f16 s17, s1 +; CHECK-NEXT: vins.f16 s19, s3 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmov.f32 s10, s21 +; CHECK-NEXT: vmov.f32 s14, s23 +; CHECK-NEXT: vmov.f32 s3, s24 +; CHECK-NEXT: vmov.f32 s11, 
s26 +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %b.imag, %a.real + %1 = fmul fast <16 x half> %b.real, %a.imag + %2 = fadd fast <16 x half> %1, %0 + %3 = fmul fast <16 x half> %b.real, %a.real + %4 = fmul fast <16 x half> %a.imag, %b.imag + %5 = fsub fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll @@ -0,0 +1,157 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define arm_aapcs_vfpcc <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f32 s5, s5, s0 +; CHECK-NEXT: vsub.f32 s4, s4, s1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %b.real, %a.imag + %1 = fadd fast <1 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + ret <2 x float> %interleaved.vec +} +define arm_aapcs_vfpcc <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %b.real, %a.imag + %1 = fadd fast <2 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} +define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vmov.f32 s7, s18 +; CHECK-NEXT: vmov.f32 s22, s13 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vadd.f32 q1, q5, q1 +; CHECK-NEXT: vmov.f32 s3, 
s19 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vsub.f32 q2, q2, q0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s0, s8 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: vmov.f32 s4, s10 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %b.real, %a.imag + %1 = fadd fast <4 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + ret <8 x float> %interleaved.vec +} +define arm_aapcs_vfpcc <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add r2, sp, #64 +; CHECK-NEXT: add r3, sp, #80 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: vldrw.u32 q6, [r2] +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vmov.f32 s28, s25 +; CHECK-NEXT: add r1, sp, #112 +; CHECK-NEXT: vmov.f32 s29, s27 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vmov.f32 s6, s16 +; CHECK-NEXT: vmov.f32 s7, s18 +; CHECK-NEXT: vsub.f32 q4, q6, q0 +; CHECK-NEXT: vmov.f32 s30, s21 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vadd.f32 q1, q7, q1 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s17, s10 +; CHECK-NEXT: vmov.f32 s28, s25 +; CHECK-NEXT: vmov.f32 s29, s27 +; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s30, s21 +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vadd.f32 q4, q7, q4 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vmov.f32 s26, s20 +; CHECK-NEXT: vmov.f32 s27, s22 +; CHECK-NEXT: vsub.f32 q3, q6, q2 +; CHECK-NEXT: vmov.f32 s9, s16 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s16, s14 +; CHECK-NEXT: vmov.f32 s18, s15 +; CHECK-NEXT: vmov.f32 s8, s12 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %b.real, %a.imag + %1 = 
fadd fast <8 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define arm_aapcs_vfpcc <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f32 s9, s5, s0 +; CHECK-NEXT: vmul.f32 s8, s1, s5 +; CHECK-NEXT: vfma.f32 s9, s4, s1 +; CHECK-NEXT: vfnms.f32 s8, s4, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %b.imag, %a.real + %1 = fmul fast <1 x float> %b.real, %a.imag + %2 = fadd fast <1 x float> %1, %0 + %3 = fmul fast <1 x float> %b.real, %a.real + %4 = fmul fast <1 x float> %a.imag, %b.imag + %5 = fsub fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> + ret <2 x float> %interleaved.vec +} + +define arm_aapcs_vfpcc <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #90 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %b.imag, %a.real + %1 = fmul fast <2 x float> %b.real, %a.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %b.real, %a.real + %4 = fmul fast <2 x float> %a.imag, %b.imag + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +define arm_aapcs_vfpcc <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov.f32 s20, s0 +; CHECK-NEXT: vmov.f32 s21, s2 +; CHECK-NEXT: vmov.f32 s24, s9 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vmov.f32 s23, s18 +; CHECK-NEXT: vmov.f32 s26, s13 +; CHECK-NEXT: vmov.f32 s27, s15 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmul.f32 q1, q6, q5 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vmov.f32 s10, s12 +; CHECK-NEXT: vmov.f32 s11, s14 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vmul.f32 q0, q0, q6 +; CHECK-NEXT: vneg.f32 q3, q0 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vfma.f32 q3, q2, q5 
+; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vmov.f32 s4, s14 +; CHECK-NEXT: vmov.f32 s6, s15 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %b.imag, %a.real + %1 = fmul fast <4 x float> %b.real, %a.imag + %2 = fadd fast <4 x float> %1, %0 + %3 = fmul fast <4 x float> %b.real, %a.real + %4 = fmul fast <4 x float> %a.imag, %b.imag + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + ret <8 x float> %interleaved.vec +} + +define arm_aapcs_vfpcc <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: add r2, sp, #96 +; CHECK-NEXT: add r3, sp, #112 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s24, s1 +; CHECK-NEXT: vmov.f32 s12, s17 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vmov.f32 s13, s19 +; CHECK-NEXT: add r1, sp, #128 +; CHECK-NEXT: vmov.f32 s25, s3 +; CHECK-NEXT: vmov.f32 s14, s21 +; CHECK-NEXT: vmov.f32 s15, s23 +; CHECK-NEXT: vmov.f32 s26, s5 +; CHECK-NEXT: vmov.f32 s27, s7 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmul.f32 q7, q6, q3 +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vmul.f32 q1, q3, q0 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vmov.f32 s19, s22 +; CHECK-NEXT: vneg.f32 q5, q7 +; CHECK-NEXT: vfma.f32 q5, q4, q0 +; CHECK-NEXT: vfma.f32 q1, q4, q6 +; CHECK-NEXT: vmov.f32 s0, s20 +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s10 +; CHECK-NEXT: vmov.f32 s28, s25 +; CHECK-NEXT: vmov.f32 s29, s27 +; CHECK-NEXT: vmov.f32 s8, s9 +; CHECK-NEXT: vmov.f32 s9, s11 +; CHECK-NEXT: vmov.f32 s30, s13 +; CHECK-NEXT: vmov.f32 s31, s15 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s3 +; CHECK-NEXT: vmov.f32 s4, s22 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s6, s23 +; CHECK-NEXT: vmov.f32 s22, s0 +; CHECK-NEXT: vmov.f32 s23, s2 +; CHECK-NEXT: vmul.f32 q0, q2, q7 +; CHECK-NEXT: vmov.f32 s25, s26 +; CHECK-NEXT: vmul.f32 q4, q7, q5 +; CHECK-NEXT: vmov.f32 s26, s12 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vmov.f32 s27, s14 +; CHECK-NEXT: vfma.f32 q4, q6, q2 +; CHECK-NEXT: vfma.f32 q0, q6, q5 +; CHECK-NEXT: vmov.f32 s9, s16 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s16, s2 +; CHECK-NEXT: vmov.f32 s18, s3 +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov q3, q4 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vpop {d8, d9, d10, d11, 
d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %b.imag, %a.real + %1 = fmul fast <8 x float> %b.real, %a.imag + %2 = fadd fast <8 x float> %1, %0 + %3 = fmul fast <8 x float> %b.real, %a.real + %4 = fmul fast <8 x float> %a.imag, %b.imag + %5 = fsub fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define arm_aapcs_vfpcc <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d3, d3, d0 +; CHECK-NEXT: vsub.f64 d2, d2, d1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} +define arm_aapcs_vfpcc <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d5, d5, d0 +; CHECK-NEXT: vsub.f64 d4, d4, d1 +; CHECK-NEXT: vadd.f64 d7, d7, d2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vsub.f64 d6, d6, d3 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} +define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vsub.f64 d0, d0, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vadd.f64 d3, d3, d8 +; CHECK-NEXT: vsub.f64 d2, d2, d9 +; CHECK-NEXT: 
vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vadd.f64 d9, d9, d4 +; CHECK-NEXT: vsub.f64 d8, d8, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.f64 d11, d5, d6 +; CHECK-NEXT: vsub.f64 d10, d4, d7 +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define arm_aapcs_vfpcc <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vfma.f64 d5, d2, d1 +; CHECK-NEXT: vfnms.f64 d4, d2, d0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmul.f64 d9, d7, d2 +; CHECK-NEXT: vmov q5, q0 +; CHECK-NEXT: vmul.f64 d8, d3, d7 +; CHECK-NEXT: vfma.f64 d9, d6, d3 +; CHECK-NEXT: vfnms.f64 d8, d6, d2 +; CHECK-NEXT: vmul.f64 d1, d5, d10 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmul.f64 d0, d11, d5 +; CHECK-NEXT: vfma.f64 d1, d4, d11 +; CHECK-NEXT: vfnms.f64 d0, d4, d10 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, 
%b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +define arm_aapcs_vfpcc <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: add r0, sp, #128 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #160 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: add r0, sp, #176 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d11, d3, d0 +; CHECK-NEXT: vmul.f64 d10, d1, d3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f64 d7, d9, d12 +; CHECK-NEXT: vmul.f64 d2, d15, d1 +; CHECK-NEXT: vmul.f64 d3, d1, d14 +; CHECK-NEXT: vmul.f64 d6, d13, d9 +; CHECK-NEXT: vfma.f64 d7, d8, d13 +; CHECK-NEXT: vfnms.f64 d6, d8, d12 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d3, d0, d15 +; CHECK-NEXT: vfnms.f64 d2, d0, d14 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d5, d0, d9 +; CHECK-NEXT: vfnms.f64 d4, d0, d8 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d11, d0, d9 +; CHECK-NEXT: vfnms.f64 d10, d0, d8 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare