diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -0,0 +1,40 @@ +//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic and deinterleaving. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H +#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H + +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" + +namespace llvm { + +class Function; +class TargetMachine; + +struct ComplexDeinterleavingPass + : public PassInfoMixin { +private: + TargetMachine *TM; + +public: + ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +enum class ComplexDeinterleavingOperation { None, CAdd, CMulPartial }; + +} // namespace llvm + +#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -80,6 +80,10 @@ /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(); + /// This pass implements generation of target-specific intrinsics to support + /// handling of complex number arithmetic + FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM); + /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg /// load-linked/store-conditional loops. extern char &AtomicExpandID; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -22,6 +22,7 @@ #ifndef LLVM_CODEGEN_TARGETLOWERING_H #define LLVM_CODEGEN_TARGETLOWERING_H +#include "ComplexDeinterleavingPass.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -3049,6 +3050,27 @@ return isOperationLegalOrCustom(Op, VT); } + /// Does this target support complex deinterleaving + virtual bool isComplexDeinterleavingSupported() const { return false; } + + /// Does this target support complex deinterleaving with the given operation + /// and type + virtual bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + return false; + } + + /// Create the IR node for the given complex deinterleaving operation. + /// If one cannot be created using all the given inputs, nullptr should be + /// returned. 
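+  /// A target will typically map the node onto one of its complex vector
+  /// intrinsics. As an illustrative sketch only (the intrinsic name and
+  /// operand order below are hypothetical, not part of this interface), a
+  /// CAdd node with Rotation == 90 could be lowered as:
+  /// \code
+  ///   IRBuilder<> B(I);
+  ///   return B.CreateIntrinsic(Intrinsic::my_target_vcadd_rot90,
+  ///                            InputA->getType(), {InputA, InputB});
+  /// \endcode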
+ virtual Value * + createComplexDeinterleavingIR(Instruction *I, + ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const { + return nullptr; + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -101,26 +101,27 @@ void initializeCFIFixupPass(PassRegistry&); void initializeCFIInstrInserterPass(PassRegistry&); void initializeCFLAndersAAWrapperPassPass(PassRegistry&); -void initializeCFLSteensAAWrapperPassPass(PassRegistry&); +void initializeCFLSteensAAWrapperPassPass(PassRegistry &); void initializeCGProfileLegacyPassPass(PassRegistry &); -void initializeCallGraphDOTPrinterPass(PassRegistry&); -void initializeCallGraphPrinterLegacyPassPass(PassRegistry&); -void initializeCallGraphViewerPass(PassRegistry&); -void initializeCallGraphWrapperPassPass(PassRegistry&); -void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); +void initializeCallGraphDOTPrinterPass(PassRegistry &); +void initializeCallGraphPrinterLegacyPassPass(PassRegistry &); +void initializeCallGraphViewerPass(PassRegistry &); +void initializeCallGraphWrapperPassPass(PassRegistry &); +void initializeCallSiteSplittingLegacyPassPass(PassRegistry &); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); -void initializeConstantHoistingLegacyPassPass(PassRegistry&); -void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeCodeGenPreparePass(PassRegistry &); +void initializeComplexDeinterleavingLegacyPassPass(PassRegistry &); +void initializeConstantHoistingLegacyPassPass(PassRegistry &); +void initializeConstantMergeLegacyPassPass(PassRegistry &); void initializeConstraintEliminationPass(PassRegistry &); -void initializeControlHeightReductionLegacyPassPass(PassRegistry&); -void initializeCorrelatedValuePropagationPass(PassRegistry&); -void initializeCostModelAnalysisPass(PassRegistry&); -void initializeCrossDSOCFIPass(PassRegistry&); +void initializeControlHeightReductionLegacyPassPass(PassRegistry &); +void initializeCorrelatedValuePropagationPass(PassRegistry &); +void initializeCostModelAnalysisPass(PassRegistry &); +void initializeCrossDSOCFIPass(PassRegistry &); void initializeCycleInfoWrapperPassPass(PassRegistry &); -void initializeDAEPass(PassRegistry&); -void initializeDAHPass(PassRegistry&); +void initializeDAEPass(PassRegistry &); +void initializeDAHPass(PassRegistry &); void initializeDCELegacyPassPass(PassRegistry&); void initializeDFAJumpThreadingLegacyPassPass(PassRegistry &); void initializeDSELegacyPassPass(PassRegistry&); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -46,6 +46,7 @@ CodeGenPassBuilder.cpp CodeGenPrepare.cpp CommandFlags.cpp + ComplexDeinterleavingPass.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DetectDeadLanes.cpp diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -0,0 +1,1036 @@ +//===- ComplexDeinterleavingPass.cpp 
+//------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "complex-deinterleaving" + +STATISTIC(NumComplexIntrinsics, "Number of complex intrinsics generated"); + +static cl::opt ComplexArithmeticEnabled( + "enable-complex-arithmetic", + cl::desc("Enable generation of complex arithmetic instructions"), + cl::init(true), cl::Hidden); + +static bool isInterleavingMask(ArrayRef Mask); +static bool isDeinterleavingMask(ArrayRef Mask); + +namespace { + +/** + * Creates an integer array of length \p len, where each item is \p step more + * than the previous. An offset can be provided to specify the first element. + */ +static SmallVector createArrayWithStep(int len, int step, int offset = 0) { + SmallVector Arr(len); + for (int j = 0; j < len; j++) + Arr[j] = (j * step) + offset; + return Arr; +} + +/** + * Creates a deinterleaving mask of the given length at the given offset. 
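+ * With complex data stored interleaved as <r0, i0, r1, i1, ...>, the
+ * offset-0 mask selects the real lanes and the offset-1 mask selects the
+ * imaginary lanes.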
+ * A deinterleaving mask looks like <0, 2, 4, 6> or <1, 3, 5, 7> + */ +static SmallVector createDeinterleavingMask(int len, int offset = 0) { + return createArrayWithStep(len, 2, offset); +} + +class ComplexDeinterleavingLegacyPass : public FunctionPass { +public: + static char ID; + + ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeComplexDeinterleavingLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Complex Arithmetic Pass"; } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + +private: + const TargetMachine *TM; +}; + +enum OperatingComponent { Real, Imaginary, Unknown }; + +class ComplexDeinterleavingGraph; +struct ComplexDeinterleavingCompositeNode { + +private: + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op) + : Operation(Op) {} + + friend class ComplexDeinterleavingGraph; + +public: + SmallVector getOperands() { + SmallVector Ops; + + for (const auto &item : ContainedInstructions) { + for (unsigned i = 0; i < item->getNumOperands(); i++) { + auto *V = item->getOperand(i); + auto *I = dyn_cast(V); + if (!I || !contains(I)) { + Ops.push_back(V); + continue; + } + } + } + return Ops; + } + + Value *getOperand(unsigned Idx) { return getOperands()[Idx]; } + + unsigned getNumOperands() { return getOperands().size(); } + + SmallVector ContainedInstructions; + Value *OutputNode = nullptr; + Value *OriginalInput0 = nullptr; + Value *OriginalInput1 = nullptr; + Value *ReplacementNode = nullptr; + bool IsTopLevel = false; + ComplexDeinterleavingOperation Operation; + + bool UsesNegation = false; + unsigned Rotation = 0; + Value *Input0 = nullptr; + Value *Input1 = nullptr; + Value *Accumulator = nullptr; + Value *Accumulatee = nullptr; + + void addInstruction(Instruction *I) { ContainedInstructions.push_back(I); } + bool contains(Instruction *I) { + if (I == ReplacementNode) + return true; + + return std::find(ContainedInstructions.begin(), ContainedInstructions.end(), + I) != ContainedInstructions.end(); + } +}; + +class ComplexDeinterleavingGraph { +private: + using NodePtr = std::shared_ptr; + + SmallVector Instructions; + SmallVector CompositeNodes; + + llvm::TargetTransformInfo::TargetCostKind CostKind = + llvm::TargetTransformInfo::TCK_Latency; + + InstructionCost CostOfIntrinsics; + + /// Determines the operating component of the given Value. + /// This is achieved by looking at the operating component of the Value's + /// operands and, based on the instruction, evaluates what the resulting + /// component would be. 
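+  /// For example, an fmul whose operands both operate on real lanes (or both
+  /// on imaginary lanes) produces a Real result, while mixing one of each
+  /// produces an Imaginary result, mirroring the expansion of
+  /// (a + bi) * (c + di).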
+ OperatingComponent getOperatingComponentOfValue(Value *V) { + Instruction *I = dyn_cast_or_null(V); + if (!I) + return Unknown; + + if (auto *Shuffle = dyn_cast(I)) { + auto ShuffleMask = Shuffle->getShuffleMask(); + if (isDeinterleavingMask(ShuffleMask)) { + if (ShuffleMask[0] == 0) + return Real; + if (ShuffleMask[0] == 1) + return Imaginary; + } + return Unknown; + } + + if (I->getOpcode() == Instruction::FMul) { + auto Op0Component = getOperatingComponentOfValue(I->getOperand(0)); + auto Op1Component = getOperatingComponentOfValue(I->getOperand(1)); + if (Op0Component == Unknown || Op1Component == Unknown) + return Unknown; + if (Op0Component == Op1Component) + return Real; + return Imaginary; + } + + if (I->getOpcode() == Instruction::FNeg) + return getOperatingComponentOfValue(I->getOperand(0)); + + if (I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) { + auto Op0Component = getOperatingComponentOfValue(I->getOperand(0)); + auto Op1Component = getOperatingComponentOfValue(I->getOperand(1)); + if (Op0Component != Op1Component || Op1Component == Unknown) + return Unknown; + return Op0Component; + } + + return Unknown; + } + + void addInstruction(Instruction *I) { Instructions.push_back(I); } + + void sortCompositeNodes(BasicBlock *B) { + SmallVector NewNodeList; + + // Sort the nodelist based on the instruction order + for (auto &I : *B) { + if (auto CN = findNodeFromOutput(&I)) + NewNodeList.push_back(CN); + } + + for (unsigned i = 0; i < NewNodeList.size(); i++) + CompositeNodes[i] = NewNodeList[i]; + } + + NodePtr findNodeFromOutput(Instruction *I) { + for (const auto &Item : CompositeNodes) { + if (Item->OutputNode == I) + return Item; + } + + return nullptr; + } + + SmallVector findUnmatchedInstructions() { + SmallVector Is; + for (auto &I : Instructions) { + if (shouldIgnoreValue(I)) + continue; + if (getContainingComposite(I) == nullptr) + Is.push_back(I); + } + return Is; + } + + Value *getSharedOperand(Instruction *A, Instruction *B, unsigned &Idx) { + if (A->getNumOperands() != B->getNumOperands()) + return nullptr; + + for (unsigned i = 0; i < A->getNumOperands(); i++) { + auto *Op = A->getOperand(i); + if (Op == B->getOperand(i)) { + Idx = i; + return Op; + } + } + return nullptr; + } + + bool haveSharedUses(Value *A, Value *B) { + if (A->hasOneUser() && B->hasOneUser()) { + auto *AUser = *A->user_begin(); + auto *BUser = *B->user_begin(); + + if (AUser && AUser == BUser) + return true; + + auto AUCN = getContainingComposite(dyn_cast(AUser)); + auto BUCN = getContainingComposite(dyn_cast(BUser)); + + if (AUCN && AUCN == BUCN) + return true; + } + + if (A->getNumUses() != B->getNumUses()) + return false; + + for (const auto &AUser : A->users()) { + bool Found = false; + auto AUCN = getContainingComposite(dyn_cast(AUser)); + for (const auto &BUser : B->users()) { + if (AUser == BUser) { + Found = true; + break; + } + auto BUCN = getContainingComposite(dyn_cast(BUser)); + if (AUCN && AUCN == BUCN) { + Found = true; + break; + } + } + if (!Found) { + LLVM_DEBUG(dbgs() << "AUser doesn't have a match: "; AUser->dump()); + return false; + } + } + + return true; + } + + Value *followUseChain(Value *V) { + if (V->hasOneUser()) + return followUseChain(*V->user_begin()); + + // TODO handle multiple users, but how? 
+ + return V; + } + + Value *getFinalInputReplacement(Instruction *I) { + for (unsigned OpIdx = I->getNumOperands() - 1; OpIdx >= 0; OpIdx--) { + auto *Op = dyn_cast(I->getOperand(OpIdx)); + while (Op && shouldIgnoreValue(Op)) + Op = dyn_cast(Op->getOperand(0)); + if (Op == nullptr) + continue; + + auto CN = getContainingComposite(Op); + if (CN == nullptr || CN->ReplacementNode == nullptr) + continue; + return followUseChain(CN->ReplacementNode); + } + + return nullptr; + } + + Value *getReplacement(Instruction *I) { + if (!I) + return nullptr; + auto CN = getContainingComposite(I); + if (CN == nullptr || CN->ReplacementNode == nullptr) + return I; + return CN->ReplacementNode; + } + + std::shared_ptr + prepareCompositeNode(ComplexDeinterleavingOperation Operation) { + return std::shared_ptr( + new ComplexDeinterleavingCompositeNode(Operation)); + } + + void + submitCompositeNode(std::shared_ptr CN) { + CompositeNodes.push_back(CN); + } + + bool containsNode(Instruction *I) { + return std::find(Instructions.begin(), Instructions.end(), I) != + Instructions.end(); + } + + /// Certain values, such as extends and truncates, should be ignored within + /// the graph for our needs as they contribute towards structure rather than + /// function. + /// + /// e.g. A deinterleaving shuffle provides no functionality itself, + /// and does not need to be explicitly handled beyond the usual operations. A + /// shuffle that is neither interleaving nor deinterleaving is an example of + /// one that needs to be handled, and thus should not be ignored. + bool shouldIgnoreValue(Value *V) { + if (isa(V)) + return true; + + if (auto *SVI = dyn_cast(V)) { + auto Mask = SVI->getShuffleMask(); + return isInterleavingMask(Mask) || isDeinterleavingMask(Mask); + } + + if (auto *I = dyn_cast(V)) { + auto Opc = I->getOpcode(); + return I->isCast() || Opc == Instruction::FPTrunc || + Opc == Instruction::FPExt; + } + return false; + } + + /// Checks the users of the given instructions to evaluate whether the + /// returns from said instructions converge at any point. e.g. in a shuffle + bool doInstructionsConverge(Instruction *A, Instruction *B) { + if (A->hasOneUser() && B->hasOneUser()) { + auto *AUser = *A->user_begin(); + auto *BUser = *B->user_begin(); + + while (shouldIgnoreValue(AUser)) + AUser = *AUser->user_begin(); + while (shouldIgnoreValue(BUser)) + BUser = *BUser->user_begin(); + + if (AUser == BUser) + return true; + } + + return haveSharedUses(A, B); + } + + NodePtr getContainingComposite(Instruction *I) { + if (I == nullptr) + return nullptr; + for (const auto &CN : CompositeNodes) { + if (CN->contains(I)) + return CN; + if (CN->ReplacementNode == I) + return CN; + } + return nullptr; + } + +public: + /// Step through the use-def chains to find all instruction nodes converging + /// on \p I. + void discoverNodes(BasicBlock *B, Instruction *I); + /// Iterate over the nodes and reducing them to complex nodes where possible. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + bool reduceNodes(const TargetLowering *TL); + /// Perform the actual replacement of the underlying instruction graph. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. 
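+  /// The intended driver sequence is discoverNodes(), then reduceNodes(),
+  /// then replaceNodes(); replacement is only attempted once reduction has
+  /// succeeded.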
+ bool replaceNodes(const TargetLowering *TL); + void getDeadRoots(SmallVector &DeadInstRoots); +}; + +class ComplexDeinterleaving { +public: + ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli) + : TL(tl), TLI(tli) {} + bool runOnFunction(Function &F); + +private: + bool evaluateComplexDeinterleavingBasicBlock(BasicBlock *B); + + const TargetLowering *TL = nullptr; + const TargetLibraryInfo *TLI = nullptr; +}; + +} // namespace + +char ComplexDeinterleavingLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, + "Complex Deinterleaving", false, false) +INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, + "Complex Deinterleaving", false, false) + +PreservedAnalyses ComplexDeinterleavingPass::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + auto &TLI = AM.getResult(F); + if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) { + return new ComplexDeinterleavingLegacyPass(TM); +} + +bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) { + const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + auto TLI = getAnalysis().getTLI(F); + return ComplexDeinterleaving(TL, &TLI).runOnFunction(F); +} + +bool ComplexDeinterleaving::runOnFunction(Function &F) { + if (!ComplexArithmeticEnabled) { + LLVM_DEBUG(dbgs() << "Complex has been explicitly disabled.\n"); + return false; + } + + if (!TL->isComplexDeinterleavingSupported()) { + LLVM_DEBUG(dbgs() << "Complex has been disabled, target does not support " + "lowering of complex numbers.\n"); + return false; + } + + bool Changed = false; + for (auto &B : F) + Changed |= evaluateComplexDeinterleavingBasicBlock(&B); + + return Changed; +} + +/** + * Checks the given mask, and determines whether said mask is interleaving. + * + * To be interleaving, a mask must alternate between `i` and `i + (Length / 2)`, + * and must contain all numbers within the range of `[0..Length)` + * (e.g. a 4x vector interleaving mask would be <0, 2, 1, 3>). + */ +static bool isInterleavingMask(ArrayRef Mask) { + int HalfNumElements = Mask.size() / 2; + + for (int Idx = 0; Idx < HalfNumElements; ++Idx) { + if (Mask[(Idx * 2) + 1] != (Mask[Idx * 2] + HalfNumElements)) + return false; + } + + return true; +} + +/** + * Checks the given mask, and determines whether said mask is deinterleaving. + * + * To be interleaving, a mask must increment in steps of 2, and either start + * with 0 or 1. + * (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or + * <1, 3, 5, 7>). + */ +static bool isDeinterleavingMask(ArrayRef Mask) { + int Offset = Mask[0]; + int HalfNumElements = Mask.size() / 2; + + for (int Idx = 1; Idx < HalfNumElements; ++Idx) { + if (Mask[Idx] != (Idx * 2) + Offset) + return false; + } + + return true; +} + +bool ComplexDeinterleaving::evaluateComplexDeinterleavingBasicBlock( + BasicBlock *B) { + bool Changed = false; + + SmallVector DeadInstrRoots; + + for (auto &I : *B) { + if (auto *SVI = dyn_cast(&I)) { + // Look for a shufflevector that takes separate vectors of the real and + // imaginary components and recombines them into a single vector. 
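+      // For example, with 4-element real/imaginary halves the recombining
+      // shuffle is:
+      //   %vec = shufflevector <4 x float> %real, <4 x float> %imag,
+      //          <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>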
+ if (isInterleavingMask(SVI->getShuffleMask())) { + ComplexDeinterleavingGraph Graph; + Graph.discoverNodes(B, SVI); + if (Graph.reduceNodes(TL) && Graph.replaceNodes(TL)) { + Changed = true; + DeadInstrRoots.push_back(SVI); + } else { + SmallVector DeadInstrs; + Graph.getDeadRoots(DeadInstrs); + for (auto It = DeadInstrs.rbegin(); It != DeadInstrs.rend(); It++) + (*It)->eraseFromParent(); + } + } + } + } + + for (const auto &I : DeadInstrRoots) + llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + + return Changed; +} + +void ComplexDeinterleavingGraph::discoverNodes(BasicBlock *B, Instruction *I) { + + if (I->getParent() != B) + return; + + if (containsNode(I)) + return; + + if (isa(I) || isa(I)) { + // No need to discover beyond a load or a phi. + return; + } + + addInstruction(I); + + if (auto *SVI = dyn_cast(I)) { + auto ShuffleMask = SVI->getShuffleMask(); + + auto RealMask = createDeinterleavingMask(ShuffleMask.size()); + auto ImagMask = createDeinterleavingMask(ShuffleMask.size(), 1); + ArrayRef RealMaskRef(RealMask); + ArrayRef ImagMaskRef(ImagMask); + + Value *ShuffleSource; + if (match(SVI, m_Shuffle(m_Value(ShuffleSource), m_Poison(), + m_SpecificMask(RealMaskRef))) || + match(SVI, m_Shuffle(m_Value(ShuffleSource), m_Poison(), + m_SpecificMask(ImagMaskRef)))) { + // Reached "top" of graph, stop discovering. + // TODO this check needs refining + if (ShuffleSource && + (isa(ShuffleSource) || !isa(ShuffleSource))) + return; + } + } + + for (const auto &Op : I->operands()) { + if (auto *OpI = dyn_cast(Op)) + discoverNodes(B, OpI); + } +} + +bool ComplexDeinterleavingGraph::reduceNodes(const TargetLowering *TL) { + if (Instructions.empty()) { + LLVM_DEBUG(dbgs() << "No Instructions, cannot reduce.\n"); + return false; + } + + auto *ConvergingI = Instructions[0]; + + for (auto &I : *ConvergingI->getParent()) { + + if (!containsNode(&I)) + continue; + + auto *N = &I; + if ((match(N, m_FAdd(m_FMul(m_Value(), m_Value()), + m_FMul(m_Value(), m_Value()))) || + match(N, m_FSub(m_FMul(m_Value(), m_Value()), + m_FMul(m_Value(), m_Value()))))) { + + auto *VTy = dyn_cast(N->getType()); + if (!VTy) + continue; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy)) { + dbgs() << "Type isn't valid for CMulPartial: "; + NewVTy->dump(); + continue; + } + + LLVM_DEBUG(dbgs() << "Composite node built up from "; N->dump()); + auto CN = prepareCompositeNode( + llvm::ComplexDeinterleavingOperation::CMulPartial); + + auto *Op0 = cast(N->getOperand(0)); + auto *Op1 = cast(N->getOperand(1)); + + CN->addInstruction(N); + CN->addInstruction(Op0); + CN->addInstruction(Op1); + + CN->OriginalInput0 = Op0; + CN->OriginalInput1 = Op1; + + bool ContainsNeg = false; + for (unsigned i = 0; i < Op0->getNumOperands(); i++) { + auto *Op = dyn_cast(Op0->getOperand(i)); + if (Op && Op->getOpcode() == Instruction::FNeg) { + if (ContainsNeg) + break; + CN->addInstruction(Op); + CN->OriginalInput0 = Op; + ContainsNeg = true; + } + } + for (unsigned i = 0; i < Op1->getNumOperands(); i++) { + auto *Op = dyn_cast(Op1->getOperand(i)); + if (Op && Op->getOpcode() == Instruction::FNeg) { + if (ContainsNeg) + break; + CN->addInstruction(Op); + CN->OriginalInput1 = Op; + ContainsNeg = true; + } + } + + if (!ContainsNeg) { + auto &Use = (*N->use_begin()); + if (N->getOpcode() == Instruction::FSub) { + if (isa(Use.getUser()) && + Use.getOperandNo() != 0) { + LLVM_DEBUG(dbgs() + 
<< "First converging shuffle operand should be an FSub" + << ".\n"); + return false; + } + } else if (N->getOpcode() == Instruction::FAdd) { + if (isa(Use.getUser()) && + Use.getOperandNo() != 1) { + LLVM_DEBUG(dbgs() + << "Second converging shuffle operand should be an FAdd" + << ".\n"); + return false; + } + } + } + + auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()), + m_Shuffle(m_Value(), m_Value())); + CN->IsTopLevel = match(CN->OriginalInput0, Pattern) && + match(CN->OriginalInput1, Pattern); + CN->UsesNegation = ContainsNeg; + CN->OutputNode = N; + + CN->Rotation = (N->getOpcode() == Instruction::FAdd) * 90; + + if (N->getOpcode() == Instruction::FSub) { + auto *SubOp0 = cast(N->getOperand(0)); + auto SubOp0C0 = getOperatingComponentOfValue(SubOp0->getOperand(0)); + auto SubOp0C1 = getOperatingComponentOfValue(SubOp0->getOperand(1)); + + if (SubOp0C0 == SubOp0C1) { + if (SubOp0C0 == OperatingComponent::Imaginary) { + CN->Rotation += 90; + } + } + } + + if (CN->UsesNegation) + CN->Rotation += 180; + + submitCompositeNode(CN); + } + } + + auto Unmatched = findUnmatchedInstructions(); + SmallVector Pairs; + + for (auto &I : Unmatched) { + if (std::find(Pairs.begin(), Pairs.end(), I) != Pairs.end()) + continue; + for (auto &J : Unmatched) { + if (I == J || std::find(Pairs.begin(), Pairs.end(), J) != Pairs.end()) + continue; + + if (doInstructionsConverge(I, J)) { + Pairs.push_back(I); + Pairs.push_back(J); + break; + } + } + } + + // Try match found pairs + for (unsigned i = 0; i < Pairs.size(); i += 2) { + auto *I = Pairs[i]; + auto *J = Pairs[i + 1]; + + if ((I->getOpcode() == Instruction::FMul && + J->getOpcode() == Instruction::FMul)) { + + // At this point, all operands should be instructions + if (!isa(I->getOperand(0)) || + !isa(I->getOperand(1))) + continue; + if (!isa(J->getOperand(0)) || + !isa(J->getOperand(1))) + continue; + + auto *VTy = dyn_cast(I->getType()); + if (!VTy) + continue; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy)) + continue; + + // Partial mul + auto CN = prepareCompositeNode( + llvm::ComplexDeinterleavingOperation::CMulPartial); + CN->addInstruction(I); + CN->addInstruction(J); + CN->OriginalInput0 = I; + CN->OriginalInput1 = J; + + unsigned SharedIdx; + auto *SharedOp = + dyn_cast_or_null(getSharedOperand(I, J, SharedIdx)); + if (SharedOp) { + auto Opc = SharedOp->getOpcode(); + if (Opc == Instruction::FNeg) { + if (SharedIdx == 0) + CN->OriginalInput0 = SharedOp; + else if (SharedIdx == 1) + CN->OriginalInput1 = SharedOp; + else { + LLVM_DEBUG(dbgs() << "Unknown input pattern, somehow the shared " + "operand index is greater than 1.\n"); + return false; + } + CN->addInstruction(SharedOp); + CN->UsesNegation = true; + } + } + + auto Pattern = m_BinOp(m_Shuffle(m_Value(), m_Value()), + m_Shuffle(m_Value(), m_Value())); + CN->IsTopLevel = match(CN->OriginalInput0, Pattern) && + match(CN->OriginalInput1, Pattern); + + CN->OutputNode = J; + submitCompositeNode(CN); + continue; + } + + if (((I->getOpcode() == Instruction::FSub && + J->getOpcode() == Instruction::FAdd) || + (I->getOpcode() == Instruction::FAdd && + J->getOpcode() == Instruction::FSub))) { + + auto *VTy = dyn_cast(I->getType()); + if (!VTy) + continue; + + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (!TL->isComplexDeinterleavingOperationSupported( + 
ComplexDeinterleavingOperation::CAdd, NewVTy)) + continue; + + LLVM_DEBUG(dbgs() << "Pairing instructions as a CAdd.\n"); + auto CN = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd); + CN->addInstruction(I); + CN->addInstruction(J); + CN->OriginalInput0 = I; + CN->OriginalInput1 = J; + + auto *Sub = I->getOpcode() == Instruction::FSub ? I : J; + bool IsLikelyNegated = false; + if (auto *Shuffle = dyn_cast(Sub->getOperand(0))) { + auto ShuffleMask = Shuffle->getShuffleMask(); + if (isDeinterleavingMask(ShuffleMask)) + IsLikelyNegated = ShuffleMask[0] == 1; + } + + if (IsLikelyNegated) { + LLVM_DEBUG(dbgs() << "Negated adds are not yet supported.\n"); + return false; + } + + CN->UsesNegation = IsLikelyNegated; + CN->Rotation = 90; + if (I->getOpcode() == Instruction::FAdd) + CN->Rotation = 270; + CN->OutputNode = J; + submitCompositeNode(CN); + continue; + } + } + + auto UnmatchedInstructions = findUnmatchedInstructions(); + if (!UnmatchedInstructions.empty()) { + LLVM_DEBUG(dbgs() << "Unmatched instructions found in graph, cannot " + "confidently generate complex intrinsics.\n";); + return false; + } + + if (CompositeNodes.empty()) { + LLVM_DEBUG(dbgs() << "No composite nodes found.\n"); + return false; + } + + sortCompositeNodes(ConvergingI->getParent()); + + for (auto *It = CompositeNodes.begin() + 1; It != CompositeNodes.end(); + It++) { + auto CN = *It; + auto PrevCN = *(It - 1); + if (haveSharedUses(CN->OutputNode, PrevCN->OutputNode)) { + CN->Accumulator = PrevCN->OutputNode; + PrevCN->Accumulatee = CN->OutputNode; + } + } + + return true; +} + +bool ComplexDeinterleavingGraph::replaceNodes(const TargetLowering *TL) { + if (CompositeNodes.empty()) + return false; + + unsigned GeneratedIntrinsics = 0; + auto *ConvergingI = Instructions[0]; + + auto TTI = TL->getTargetMachine().getTargetTransformInfo( + *ConvergingI->getFunction()); + for (const auto &CN : CompositeNodes) { + auto *N = cast(CN->OutputNode); + + // Wrangle the inputs + + /// If the given value is part of a CompositeNode, and said node is part of + /// an accumulator chain, return the accumulator. Otherwise, returns the + /// "best fit" value (the ReplacementNode of a containing CompositeNode, or + /// the value itself) + auto FollowAccumulatorIfNecessary = [&](Value *V) -> Value * { + auto *I = dyn_cast(V); + if (!I) + return V; + + auto CN = getContainingComposite(I); + if (!CN) + return I; + + if (CN->Accumulatee) + CN = getContainingComposite(cast(CN->Accumulatee)); + + return CN->ReplacementNode; + }; + + /// Given a value and an operand index, get said operand and return it. + /// If the discovered operand is part of a composite node, return the + /// replacement instead. 
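+    /// Deinterleaving shuffles are looked through, so the value returned is
+    /// the original interleaved vector rather than one of its extracted
+    /// halves.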
+ auto GetInputFromOriginalInput = [&](Value *OriginalInput, + unsigned OpIdx) -> Value * { + auto *OriginalI = cast(OriginalInput); + if (OriginalI->getOpcode() == Instruction::FNeg) + OpIdx = 0; + + auto *Op = OriginalI->getOperand(OpIdx); + if (auto *SVI = dyn_cast(Op)) + Op = SVI->getOperand(0); + + if (!Op) + return nullptr; + + if (auto *I = dyn_cast(Op)) { + if (auto Containing = getContainingComposite(I)) { + if (Containing->ReplacementNode) + return Containing->ReplacementNode; + } + } + return Op; + }; + + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CAdd) { + Value *Sub = nullptr; + if (auto *Op0 = dyn_cast(CN->OriginalInput0)) { + if (Op0->getOpcode() == Instruction::FSub) + Sub = Op0; + } + if (!Sub) { + if (auto *Op1 = dyn_cast(CN->OriginalInput1)) { + if (Op1->getOpcode() == Instruction::FSub) + Sub = Op1; + } + } + + if (!Sub) + return false; + + CN->Input0 = + FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 0)); + CN->Input1 = + FollowAccumulatorIfNecessary(GetInputFromOriginalInput(Sub, 1)); + } else { + CN->Input0 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput0, 0)); + CN->Input1 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput1, 0)); + + if (CN->OriginalInput0 != CN->OriginalInput1 && CN->Input0 == CN->Input1) + CN->Input1 = FollowAccumulatorIfNecessary( + GetInputFromOriginalInput(CN->OriginalInput1, 1)); + } + + if (CN->Input0 == nullptr || CN->Input1 == nullptr) { + LLVM_DEBUG(dbgs() << "Couldn't find inputs. Skipping...\n"); + continue; + } + + if (CN->Accumulator) { + if (auto Node = + getContainingComposite(cast(CN->Accumulator))) + CN->Accumulator = cast(Node->ReplacementNode); + } + + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CMulPartial && + CN->Accumulator) { + if (auto Node = + getContainingComposite(cast(CN->Accumulator))) { + bool Valid90 = (Node->Rotation == 0 && CN->Rotation == 90) || + (Node->Rotation == 90 && CN->Rotation == 0); + bool Valid270 = (Node->Rotation == 180 && CN->Rotation == 270) || + (Node->Rotation == 270 && CN->Rotation == 180); + if (!Valid90 && !Valid270) { + LLVM_DEBUG(dbgs() << "Invalid rotation pairs.\n"); + return false; + } + + CN->Input0 = Node->Input0; + CN->Input1 = Node->Input1; + } + } + if (CN->Operation == llvm::ComplexDeinterleavingOperation::CAdd) { + Instruction *FAdd = cast(CN->OriginalInput0); + if (FAdd->getOpcode() != Instruction::FAdd) + FAdd = cast(CN->OriginalInput1); + auto RightComponent = getOperatingComponentOfValue(FAdd->getOperand(1)); + + if (RightComponent != OperatingComponent::Real) { + LLVM_DEBUG(dbgs() << "CAdd.FAdd[1] should be the real component.\n"); + return false; + } + } + + CN->ReplacementNode = TL->createComplexDeinterleavingIR( + N, CN->Operation, CN->Rotation, CN->Input0, CN->Input1, + CN->Accumulator); + if (!CN->ReplacementNode) { + LLVM_DEBUG(dbgs() << "Target failed to create Intrinsic call.\n"); + return false; + } + + cast(CN->ReplacementNode) + ->moveAfter(cast(CN->OutputNode)); + + CostOfIntrinsics += TTI.getInstructionCost( + cast(CN->ReplacementNode), CostKind); + GeneratedIntrinsics += 1; + } + + auto *R = getFinalInputReplacement(ConvergingI); + if (!R) { + LLVM_DEBUG(dbgs() << "Unable to find Final Input Replacement.\n"); + return false; + } + + InstructionCost CostOfNodes; + for (const auto &I : Instructions) + CostOfNodes += TTI.getInstructionCost(I, CostKind); + + LLVM_DEBUG(dbgs() << "Evaluating cost of each graph. 
Instructions: " + << CostOfNodes << ", Intrinsics: " << CostOfIntrinsics + << ".\n"); + if (CostOfIntrinsics > CostOfNodes) { + LLVM_DEBUG(dbgs() << "Not replacing, cost was too high.\n"); + return false; + } + + cast(R)->getParent()->dump(); + ConvergingI->replaceAllUsesWith(R); + + NumComplexIntrinsics += GeneratedIntrinsics; + + return true; +} + +void ComplexDeinterleavingGraph::getDeadRoots( + SmallVector &DeadInstrRoots) { + for (const auto &CN : CompositeNodes) { + if (auto *I = dyn_cast_or_null(CN->ReplacementNode)) + DeadInstrRoots.push_back(I); + } +} diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -740,6 +740,15 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21727,3 +21727,77 @@ MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } + +bool ARMTargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasMVEFloatOps(); +} + +bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast(Ty); + if (!VTy) + return false; + + if (VTy->getNumElements() * VTy->getScalarSizeInBits() != 128) + return false; + + // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 + auto *ScalarTy = VTy->getScalarType(); + if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy()) + return true; + + return false; +} + +Value *ARMTargetLowering::createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + unsigned Rotation, Value *InputA, Value *InputB, Value *Accumulator) const { + + IRBuilder<> B(I); + auto *IntTy = Type::getInt32Ty(B.getContext()); + auto *Ty = InputA->getType(); + + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + + ConstantInt *ConstMulRot = nullptr; + + if (Rotation == 0) + ConstMulRot = ConstantInt::get(IntTy, 0); + else if (Rotation == 90) + ConstMulRot = ConstantInt::get(IntTy, 1); + else if (Rotation == 180) + ConstMulRot = ConstantInt::get(IntTy, 2); + else if (Rotation == 270) + ConstMulRot = ConstantInt::get(IntTy, 3); + + if (!ConstMulRot) + return nullptr; + + if (Accumulator) + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstMulRot, Accumulator, InputB, InputA}); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + + // 1 means the value is not halved. 
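+    // (MVE also has a halving complex add, VHCADD, for integer vectors; only
+    // the non-halving VCADD form is generated here.)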
+ unsigned HalvingVal = 1; + auto *Halving = ConstantInt::get(IntTy, HalvingVal); + + unsigned RotKey; + if (Rotation == 90) + RotKey = 0; + else if (Rotation == 270) + RotKey = 1; + else + return nullptr; // Invalid rotation for arm_mve_vcaddq + + auto *RotVal = ConstantInt::get(IntTy, RotKey); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {Halving, RotVal, InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -426,12 +426,17 @@ TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. - if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createARMParallelDSPPass()); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll @@ -0,0 +1,342 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vadd.f16 s2, s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vsub.f16 s0, s4, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %b.real, %a.imag + %1 = fadd fast <1 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + ret <2 x half> %interleaved.vec +} +define <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) #0 { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vsub.f16 q1, q2, q1 +; CHECK-NEXT: vadd.f16 q0, q3, q0 +; CHECK-NEXT: vmovx.f16 s5, s4 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x 
i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %b.real, %a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + ret <4 x half> %interleaved.vec +} +define <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + ret <8 x half> %interleaved.vec +} +define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov d2, r2, r3 +; CHECK-NEXT: vldr d3, [sp, #32] +; CHECK-NEXT: add r1, sp, #40 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s2, s7 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s3, s10 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: add r1, sp, #56 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: add r1, sp, #72 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s16, s12 +; CHECK-NEXT: vmovx.f16 s5, s13 +; CHECK-NEXT: vmovx.f16 s17, s14 +; CHECK-NEXT: vins.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s5, s15 +; CHECK-NEXT: vins.f16 s17, s5 +; CHECK-NEXT: vmovx.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s5, s21 +; CHECK-NEXT: vmovx.f16 s19, s22 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s5, s23 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vins.f16 s20, s21 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s19, s5 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s22 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vsub.f16 q0, q3, q0 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vadd.f16 q1, q4, q1 +; CHECK-NEXT: vmovx.f16 s9, s2 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vmovx.f16 s6, s6 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s4, s1 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vins.f16 s9, s6 +; CHECK-NEXT: vmovx.f16 s11, s3 +; CHECK-NEXT: vins.f16 s3, s7 +; CHECK-NEXT: vmovx.f16 s6, s7 +; 
CHECK-NEXT: vins.f16 s4, s8 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vins.f16 s11, s6 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vstrw.32 q2, [r0, #16] +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmov.f32 s9, s12 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s4 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + ret <16 x half> %interleaved.vec +} +define <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: vmov d2, r2, r3 +; CHECK-NEXT: vldr d3, [sp, #72] +; CHECK-NEXT: add r1, sp, #128 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s7 +; CHECK-NEXT: add r1, sp, #144 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s21 +; CHECK-NEXT: vmovx.f16 s12, s20 +; CHECK-NEXT: vmov.f32 s8, s20 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmov.f32 s9, s22 +; CHECK-NEXT: vmovx.f16 s2, s23 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: add r1, sp, #80 +; CHECK-NEXT: vmovx.f16 s14, s16 +; CHECK-NEXT: vins.f16 s13, s2 +; CHECK-NEXT: vmovx.f16 s2, s17 +; CHECK-NEXT: vins.f16 s8, s21 +; CHECK-NEXT: vins.f16 s9, s23 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s15, s18 +; CHECK-NEXT: vins.f16 s14, s2 +; CHECK-NEXT: vmovx.f16 s2, s19 +; CHECK-NEXT: vmovx.f16 s10, s21 +; CHECK-NEXT: vins.f16 s15, s2 +; CHECK-NEXT: vmovx.f16 s2, s20 +; CHECK-NEXT: vins.f16 s18, s19 +; CHECK-NEXT: vmovx.f16 s3, s22 +; CHECK-NEXT: vins.f16 s2, s10 +; CHECK-NEXT: vmovx.f16 s10, s23 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vins.f16 s16, s17 +; CHECK-NEXT: vins.f16 s20, s21 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s3, s10 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: add r1, sp, #96 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s11, s18 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vsub.f16 q0, q2, q0 +; CHECK-NEXT: vmov.f32 s7, s22 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vadd.f16 q1, q3, q1 +; CHECK-NEXT: vmovx.f16 s9, s2 +; CHECK-NEXT: vins.f16 s0, s4 +; CHECK-NEXT: vmovx.f16 s4, s4 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s5 +; CHECK-NEXT: vstr s8, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vins.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s4, s6 +; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vmovx.f16 s11, s3 +; CHECK-NEXT: vmovx.f16 s4, s7 +; CHECK-NEXT: vmovx.f16 s12, s16 +; CHECK-NEXT: vins.f16 s11, s4 +; CHECK-NEXT: vmovx.f16 s4, s17 +; CHECK-NEXT: add r1, sp, #160 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vldrw.u32 q7, [r1] +; 
CHECK-NEXT: vmovx.f16 s13, s18 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: add r1, sp, #112 +; CHECK-NEXT: vins.f16 s3, s7 +; CHECK-NEXT: vins.f16 s1, s5 +; CHECK-NEXT: vins.f16 s2, s6 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstr s8, [sp] @ 4-byte Spill +; CHECK-NEXT: add r1, sp, #176 +; CHECK-NEXT: vmov.f32 s20, s28 +; CHECK-NEXT: vmovx.f16 s14, s4 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vins.f16 s14, s8 +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmovx.f16 s15, s6 +; CHECK-NEXT: vmovx.f16 s8, s7 +; CHECK-NEXT: vins.f16 s18, s19 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vins.f16 s15, s8 +; CHECK-NEXT: vins.f16 s16, s17 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vins.f16 s20, s29 +; CHECK-NEXT: vmovx.f16 s8, s29 +; CHECK-NEXT: vmov.f32 s18, s4 +; CHECK-NEXT: vmovx.f16 s4, s31 +; CHECK-NEXT: vmovx.f16 s29, s30 +; CHECK-NEXT: vmov.f32 s21, s30 +; CHECK-NEXT: vins.f16 s29, s4 +; CHECK-NEXT: vmovx.f16 s4, s25 +; CHECK-NEXT: vmovx.f16 s30, s24 +; CHECK-NEXT: vmov.f32 s22, s24 +; CHECK-NEXT: vmovx.f16 s28, s28 +; CHECK-NEXT: vins.f16 s21, s31 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s30, s4 +; CHECK-NEXT: vmovx.f16 s31, s26 +; CHECK-NEXT: vmovx.f16 s4, s27 +; CHECK-NEXT: vins.f16 s26, s27 +; CHECK-NEXT: vins.f16 s22, s25 +; CHECK-NEXT: vmov.f32 s23, s26 +; CHECK-NEXT: vins.f16 s28, s8 +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vins.f16 s31, s4 +; CHECK-NEXT: vsub.f16 q5, q5, q3 +; CHECK-NEXT: vadd.f16 q4, q7, q4 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: vmovx.f16 s4, s18 +; CHECK-NEXT: vmovx.f16 s15, s23 +; CHECK-NEXT: vins.f16 s13, s4 +; CHECK-NEXT: vmovx.f16 s4, s19 +; CHECK-NEXT: vins.f16 s15, s4 +; CHECK-NEXT: vmovx.f16 s4, s20 +; CHECK-NEXT: vmovx.f16 s6, s16 +; CHECK-NEXT: vins.f16 s22, s18 +; CHECK-NEXT: vins.f16 s23, s19 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s21 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vins.f16 s20, s16 +; CHECK-NEXT: vins.f16 s21, s17 +; CHECK-NEXT: vmov q4, q5 +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vmov.f32 s12, s22 +; CHECK-NEXT: vmov.f32 s14, s23 +; CHECK-NEXT: vmov.f32 s17, s4 +; CHECK-NEXT: vstrw.32 q3, [r0, #48] +; CHECK-NEXT: vmov.f32 s19, s6 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vmov.f32 s18, s21 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vstrw.32 q4, [r0, #32] +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vstrw.32 q2, [r0, #16] +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vldr s5, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vldr s7, [sp] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x half> %b.real, %a.imag + %1 = fadd fast <16 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll @@ -0,0 +1,387 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) #0 { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov d2, r0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmul.f16 s6, s2, s4 +; CHECK-NEXT: vfma.f16 s6, s0, s8 +; CHECK-NEXT: vmul.f16 s8, s8, s2 +; CHECK-NEXT: vfnms.f16 s8, s0, s4 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %b.imag, %a.real + %1 = fmul fast <1 x half> %b.real, %a.imag + %2 = fadd fast <1 x half> %1, %0 + %3 = fmul fast <1 x half> %b.real, %a.real + %4 = fmul fast <1 x half> %a.imag, %b.imag + %5 = fsub fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> + ret <2 x half> %interleaved.vec +} + +define <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) #0 { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmul.f16 q4, q3, q0 +; CHECK-NEXT: vfma.f16 q4, q2, q1 +; CHECK-NEXT: vmul.f16 q1, q1, q3 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s5, s4 +; CHECK-NEXT: vins.f16 s4, s16 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %b.imag, %a.real + %1 = fmul fast <2 x half> %b.real, %a.imag + %2 = fadd fast <2 x half> %1, %0 + %3 = fmul fast <2 x half> %b.real, %a.real + %4 = fmul fast <2 x half> %a.imag, %b.imag + %5 = fsub fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> + ret <4 x half> %interleaved.vec +} + +define <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) #0 { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #90 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #0 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %b.real = 
shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %b.imag, %a.real + %1 = fmul fast <4 x half> %b.real, %a.imag + %2 = fadd fast <4 x half> %1, %0 + %3 = fmul fast <4 x half> %b.real, %a.real + %4 = fmul fast <4 x half> %a.imag, %b.imag + %5 = fsub fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> + ret <8 x half> %interleaved.vec +} + +define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) #0 { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vldr d1, [sp, #32] +; CHECK-NEXT: add r1, sp, #56 +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s6, s1 +; CHECK-NEXT: vmovx.f16 s5, s2 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s3 +; CHECK-NEXT: add r1, sp, #72 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vins.f16 s5, s6 +; CHECK-NEXT: vmovx.f16 s6, s21 +; CHECK-NEXT: vmovx.f16 s12, s20 +; CHECK-NEXT: vmov.f32 s8, s20 +; CHECK-NEXT: vins.f16 s12, s6 +; CHECK-NEXT: vmov.f32 s9, s22 +; CHECK-NEXT: vmovx.f16 s6, s23 +; CHECK-NEXT: vmovx.f16 s13, s22 +; CHECK-NEXT: add r1, sp, #40 +; CHECK-NEXT: vmovx.f16 s14, s16 +; CHECK-NEXT: vins.f16 s13, s6 +; CHECK-NEXT: vmovx.f16 s6, s17 +; CHECK-NEXT: vins.f16 s8, s21 +; CHECK-NEXT: vins.f16 s9, s23 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s15, s18 +; CHECK-NEXT: vins.f16 s14, s6 +; CHECK-NEXT: vmovx.f16 s6, s19 +; CHECK-NEXT: vmovx.f16 s7, s22 +; CHECK-NEXT: vins.f16 s15, s6 +; CHECK-NEXT: vmovx.f16 s6, s20 +; CHECK-NEXT: vmovx.f16 s10, s21 +; CHECK-NEXT: vins.f16 s2, s3 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vins.f16 s20, s21 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s18, s19 +; CHECK-NEXT: vins.f16 s6, s10 +; CHECK-NEXT: vmovx.f16 s10, s23 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s16, s17 +; CHECK-NEXT: vmov.f32 s2, s20 +; CHECK-NEXT: vins.f16 s7, s10 +; CHECK-NEXT: vmov.f32 s3, s22 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s11, s18 +; CHECK-NEXT: vmul.f16 q4, q3, q0 +; CHECK-NEXT: vfma.f16 q4, q2, q1 +; CHECK-NEXT: vmul.f16 q1, q1, q3 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s8, s4 +; CHECK-NEXT: vmovx.f16 s10, s5 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s0, s17 +; CHECK-NEXT: vins.f16 s10, s0 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s0, s18 +; CHECK-NEXT: vmovx.f16 s3, s7 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vins.f16 s7, s19 +; CHECK-NEXT: vmovx.f16 s0, s19 +; CHECK-NEXT: vins.f16 s6, s18 +; CHECK-NEXT: vins.f16 s3, s0 +; CHECK-NEXT: vins.f16 s4, s16 +; CHECK-NEXT: vins.f16 s5, s17 +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q0, [r0, #16] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %0 = fmul fast 
<8 x half> %b.imag, %a.real + %1 = fmul fast <8 x half> %b.real, %a.imag + %2 = fadd fast <8 x half> %1, %0 + %3 = fmul fast <8 x half> %b.real, %a.real + %4 = fmul fast <8 x half> %a.imag, %b.imag + %5 = fsub fast <8 x half> %3, %4 + %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> + ret <16 x half> %interleaved.vec +} + +define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) #0 { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: vmov d2, r2, r3 +; CHECK-NEXT: vldr d3, [sp, #104] +; CHECK-NEXT: add r1, sp, #112 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s1, s6 +; CHECK-NEXT: vmovx.f16 s2, s7 +; CHECK-NEXT: vmovx.f16 s12, s9 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vins.f16 s2, s12 +; CHECK-NEXT: vmovx.f16 s3, s10 +; CHECK-NEXT: vmovx.f16 s12, s11 +; CHECK-NEXT: add r1, sp, #160 +; CHECK-NEXT: vins.f16 s3, s12 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: add r1, sp, #176 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmovx.f16 s16, s12 +; CHECK-NEXT: vmovx.f16 s5, s13 +; CHECK-NEXT: vmovx.f16 s17, s14 +; CHECK-NEXT: vins.f16 s16, s5 +; CHECK-NEXT: vmovx.f16 s5, s15 +; CHECK-NEXT: vins.f16 s17, s5 +; CHECK-NEXT: vmovx.f16 s18, s20 +; CHECK-NEXT: vmovx.f16 s5, s21 +; CHECK-NEXT: vmovx.f16 s19, s22 +; CHECK-NEXT: vins.f16 s18, s5 +; CHECK-NEXT: vmovx.f16 s5, s23 +; CHECK-NEXT: vins.f16 s6, s7 +; CHECK-NEXT: vins.f16 s10, s11 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s19, s5 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vins.f16 s14, s15 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vins.f16 s20, s21 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vins.f16 s12, s13 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: add r1, sp, #144 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vldrw.u32 q7, [r1] +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: add r1, sp, #192 +; CHECK-NEXT: vmov.f32 s15, s22 +; CHECK-NEXT: vmul.f16 q5, q4, q1 +; CHECK-NEXT: vfma.f16 q5, q3, q0 +; CHECK-NEXT: vmul.f16 q0, q0, q4 +; CHECK-NEXT: vneg.f16 q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vfma.f16 q2, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s1, s10 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vstr s2, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s22 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s3, s11 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: add r1, sp, #208 +; CHECK-NEXT: vins.f16 s3, s0 +; CHECK-NEXT: vmovx.f16 s14, s28 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s0, s29 +; CHECK-NEXT: vins.f16 s8, s20 +; CHECK-NEXT: vins.f16 s9, s21 +; CHECK-NEXT: vins.f16 s10, s22 +; CHECK-NEXT: vins.f16 s11, s23 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmovx.f16 s15, s30 +; CHECK-NEXT: vmovx.f16 s0, s31 +; CHECK-NEXT: vins.f16 s15, s0 +; CHECK-NEXT: vmovx.f16 s27, s22 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: add r1, sp, #128 +; CHECK-NEXT: vstr s2, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: vins.f16 s27, s0 +; CHECK-NEXT: vldrw.u32 q0, [r1] 
+; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s24, s4 +; CHECK-NEXT: vmovx.f16 s25, s6 +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s8, s5 +; CHECK-NEXT: vins.f16 s1, s3 +; CHECK-NEXT: vins.f16 s24, s8 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s25, s0 +; CHECK-NEXT: vmovx.f16 s0, s21 +; CHECK-NEXT: vmovx.f16 s26, s20 +; CHECK-NEXT: vins.f16 s30, s31 +; CHECK-NEXT: vins.f16 s28, s29 +; CHECK-NEXT: vins.f16 s26, s0 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vmovx.f16 s13, s2 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmov.f32 s17, s6 +; CHECK-NEXT: vins.f16 s22, s23 +; CHECK-NEXT: vmov.f32 s18, s20 +; CHECK-NEXT: vins.f16 s16, s5 +; CHECK-NEXT: vmov.f32 s10, s28 +; CHECK-NEXT: vins.f16 s17, s7 +; CHECK-NEXT: vmov.f32 s11, s30 +; CHECK-NEXT: vins.f16 s18, s21 +; CHECK-NEXT: vmul.f16 q1, q6, q2 +; CHECK-NEXT: vmov.f32 s19, s22 +; CHECK-NEXT: vmul.f16 q0, q3, q6 +; CHECK-NEXT: vfma.f16 q1, q4, q3 +; CHECK-NEXT: vneg.f16 q3, q0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vfma.f16 q3, q4, q2 +; CHECK-NEXT: vmovx.f16 s1, s14 +; CHECK-NEXT: vmovx.f16 s3, s15 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s0, s7 +; CHECK-NEXT: vins.f16 s14, s6 +; CHECK-NEXT: vins.f16 s3, s0 +; CHECK-NEXT: vmovx.f16 s6, s12 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s15, s7 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vmovx.f16 s4, s13 +; CHECK-NEXT: vins.f16 s13, s5 +; CHECK-NEXT: vmovx.f16 s0, s5 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmov.f32 s0, s14 +; CHECK-NEXT: vmov.f32 s2, s15 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vstrw.32 q0, [r0, #48] +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vmov.f32 s11, s4 +; CHECK-NEXT: vstrw.32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s0, s6 +; CHECK-NEXT: vmov.f32 s2, s7 +; CHECK-NEXT: vstrw.32 q0, [r0, #16] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldr s1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vldr s3, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %b.imag, %a.real + %1 = fmul fast <16 x half> %b.real, %a.imag + %2 = fadd fast <16 x half> %1, %0 + %3 = fmul fast <16 x half> %b.real, %a.real + %4 = fmul fast <16 x half> %a.imag, %b.imag + %5 = fsub fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll @@ -0,0 +1,192 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < 
%s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vadd.f32 s5, s5, s0 +; CHECK-NEXT: vsub.f32 s4, s4, s1 +; CHECK-NEXT: vmov r0, r1, d2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %b.real, %a.imag + %1 = fadd fast <1 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + ret <2 x float> %interleaved.vec +} +define <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %b.real, %a.imag + %1 = fadd fast <2 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} +define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) #0 { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: add.w lr, sp, #48 +; CHECK-NEXT: add r1, sp, #64 +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vldrw.u32 q1, [lr] +; CHECK-NEXT: add.w r12, sp, #32 +; CHECK-NEXT: vldr d1, [sp, #24] +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vmov.f32 s11, s15 +; CHECK-NEXT: vmov.f32 s6, s12 +; CHECK-NEXT: vmov.f32 s7, s14 +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vadd.f32 q2, q2, q4 +; CHECK-NEXT: vmov.f32 s3, s15 +; CHECK-NEXT: vsub.f32 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s5, s10 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r7, pc} +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, 
<4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %b.real, %a.imag + %1 = fadd fast <4 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + ret <8 x float> %interleaved.vec +} +define <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add r4, sp, #152 +; CHECK-NEXT: add r5, sp, #88 +; CHECK-NEXT: add r6, sp, #168 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: add r2, sp, #104 +; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: vldrw.u32 q2, [r2] +; CHECK-NEXT: vldrw.u32 q3, [r6] +; CHECK-NEXT: vldrw.u32 q4, [r4] +; CHECK-NEXT: vmov.f32 s4, s20 +; CHECK-NEXT: vmov.f32 s5, s22 +; CHECK-NEXT: add.w lr, sp, #120 +; CHECK-NEXT: vmov.f32 s24, s17 +; CHECK-NEXT: add r1, sp, #136 +; CHECK-NEXT: vmov.f32 s25, s19 +; CHECK-NEXT: add.w r12, sp, #72 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vldr d1, [sp, #64] +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmov.f32 s26, s13 +; CHECK-NEXT: vmov.f32 s27, s15 +; CHECK-NEXT: vadd.f32 q1, q6, q1 +; CHECK-NEXT: vmov.f32 s20, s21 +; CHECK-NEXT: vmov.f32 s21, s23 +; CHECK-NEXT: vmov.f32 s22, s9 +; CHECK-NEXT: vmov.f32 s23, s11 +; CHECK-NEXT: vmov.f32 s17, s18 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vsub.f32 q4, q4, q5 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vldrw.u32 q1, [lr] +; CHECK-NEXT: vmov.f32 s8, s18 +; CHECK-NEXT: vmov.f32 s10, s19 +; CHECK-NEXT: vmov.f32 s12, s16 +; CHECK-NEXT: vstrw.32 q2, [r0, #48] +; CHECK-NEXT: vmov.f32 s14, s17 +; CHECK-NEXT: vmov.f32 s16, s5 +; CHECK-NEXT: vstrw.32 q3, [r0, #32] +; CHECK-NEXT: vmov.f32 s17, s7 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s18, s21 +; CHECK-NEXT: vmov.f32 s19, s23 +; CHECK-NEXT: vmov.f32 s6, s20 +; CHECK-NEXT: vmov.f32 s7, s22 +; CHECK-NEXT: vldrw.u32 q5, [r12] +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s9, s2 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s10, s20 +; CHECK-NEXT: vmov.f32 s11, s22 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vadd.f32 q2, q4, q2 +; CHECK-NEXT: vmov.f32 s3, s23 +; CHECK-NEXT: vsub.f32 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s5, s10 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %b.real, %a.imag + %1 = fadd fast <8 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x 
float> %0, <8 x float> %1, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll @@ -0,0 +1,222 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) #0 { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r2, sp +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vmov r2, r3, d0 +; CHECK-NEXT: vmul.f32 s9, s5, s0 +; CHECK-NEXT: vmul.f32 s8, s1, s5 +; CHECK-NEXT: vfma.f32 s9, s4, s1 +; CHECK-NEXT: vfnms.f32 s8, s4, s0 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %b.imag, %a.real + %1 = fmul fast <1 x float> %b.real, %a.imag + %2 = fadd fast <1 x float> %1, %0 + %3 = fmul fast <1 x float> %b.real, %a.real + %4 = fmul fast <1 x float> %a.imag, %b.imag + %5 = fsub fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> + ret <2 x float> %interleaved.vec +} + +define <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #90 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #0 +; CHECK-NEXT: vmov r0, r1, d4 +; CHECK-NEXT: vmov r2, r3, d5 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %b.imag, %a.real + %1 = fmul fast <2 x float> %b.real, %a.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %b.real, %a.real + %4 = fmul fast <2 x float> %a.imag, %b.imag + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) #0 { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: add.w lr, sp, #64 +; CHECK-NEXT: add.w r12, sp, #48 +; CHECK-NEXT: add r1, sp, #80 +; CHECK-NEXT: vldrw.u32 q3, [lr] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: vldr d1, [sp, #40] +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov.f32 s4, s12 +; CHECK-NEXT: vmov.f32 s5, s14 +; CHECK-NEXT: vmov.f32 s12, s13 +; CHECK-NEXT: vmov.f32 s13, s15 +; CHECK-NEXT: vmov.f32 s20, s0 +; CHECK-NEXT: vmov.f32 s21, s2 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: 
vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s22, s16 +; CHECK-NEXT: vmov.f32 s23, s18 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmul.f32 q2, q3, q5 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s19 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vmul.f32 q0, q0, q3 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q1, q5 +; CHECK-NEXT: vmov.f32 s5, s10 +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: vmov.f32 s4, s0 +; CHECK-NEXT: vmov.f32 s5, s8 +; CHECK-NEXT: vmov.f32 s6, s1 +; CHECK-NEXT: vmov.f32 s7, s9 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %b.imag, %a.real + %1 = fmul fast <4 x float> %b.real, %a.imag + %2 = fadd fast <4 x float> %1, %0 + %3 = fmul fast <4 x float> %b.real, %a.real + %4 = fmul fast <4 x float> %a.imag, %b.imag + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + ret <8 x float> %interleaved.vec +} + +define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add r4, sp, #184 +; CHECK-NEXT: add r5, sp, #168 +; CHECK-NEXT: add r6, sp, #120 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: add r2, sp, #104 +; CHECK-NEXT: vldrw.u32 q4, [r6] +; CHECK-NEXT: vldrw.u32 q6, [r2] +; CHECK-NEXT: vldrw.u32 q5, [r4] +; CHECK-NEXT: vldrw.u32 q3, [r5] +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s8, s24 +; CHECK-NEXT: add.w lr, sp, #136 +; CHECK-NEXT: vmov.f32 s9, s26 +; CHECK-NEXT: add r1, sp, #152 +; CHECK-NEXT: vmov.f32 s28, s13 +; CHECK-NEXT: add.w r12, sp, #88 +; CHECK-NEXT: vmov.f32 s29, s15 +; CHECK-NEXT: vldr d1, [sp, #80] +; CHECK-NEXT: vmov.f32 s24, s25 +; CHECK-NEXT: vmov.f32 s25, s27 +; CHECK-NEXT: vmov.f32 s30, s21 +; CHECK-NEXT: vmov.f32 s31, s23 +; CHECK-NEXT: vmov.f32 s26, s17 +; CHECK-NEXT: vmov.f32 s27, s19 +; CHECK-NEXT: vmov.f32 s11, s18 +; CHECK-NEXT: vmul.f32 q4, q6, q7 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vmul.f32 q1, q7, q2 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vneg.f32 q4, q4 +; CHECK-NEXT: vmov.f32 s15, s22 +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vfma.f32 q4, q3, q2 +; CHECK-NEXT: vfma.f32 q1, q3, q6 +; CHECK-NEXT: vmov.f32 s13, s4 +; CHECK-NEXT: vmov.f32 s15, s5 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s12, s16 +; CHECK-NEXT: vmov.f32 s14, s17 +; CHECK-NEXT: vmov.f32 s4, s18 +; CHECK-NEXT: vmov.f32 s6, s19 +; CHECK-NEXT: vldrw.u32 q4, [lr] +; CHECK-NEXT: vmov.f32 s10, s20 +; CHECK-NEXT: vstrw.32 q1, [r0, #48] +; CHECK-NEXT: vmov.f32 s8, s16 +; CHECK-NEXT: vstrw.32 q3, [r0, #32] +; CHECK-NEXT: vmov.f32 s9, s18 +; CHECK-NEXT: vmov.f32 s16, s17 +; CHECK-NEXT: vmov.f32 s17, s19 +; CHECK-NEXT: vmov.f32 s11, s22 +; CHECK-NEXT: vmov.f32 s18, s21 +; CHECK-NEXT: 
vmov.f32 s19, s23 +; CHECK-NEXT: vldrw.u32 q5, [r12] +; CHECK-NEXT: vmov.f32 s12, s0 +; CHECK-NEXT: vmov.f32 s13, s2 +; CHECK-NEXT: vmov.f32 s14, s20 +; CHECK-NEXT: vmov.f32 s15, s22 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmul.f32 q1, q4, q3 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s2, s21 +; CHECK-NEXT: vmov.f32 s3, s23 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vmul.f32 q0, q0, q4 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vfma.f32 q0, q2, q3 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s8, s2 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s11, s7 +; CHECK-NEXT: vstrw.32 q2, [r0, #16] +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %b.imag, %a.real + %1 = fmul fast <8 x float> %b.real, %a.imag + %2 = fadd fast <8 x float> %1, %0 + %3 = fmul fast <8 x float> %b.real, %a.real + %4 = fmul fast <8 x float> %a.imag, %b.imag + %5 = fsub fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll @@ -0,0 +1,199 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + + +define <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: add r0, sp, #40 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} +define <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) 
#0 { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: mov r10, r0 +; CHECK-NEXT: add r0, sp, #104 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: add r2, sp, #88 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: add r3, sp, #120 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: vldrw.u32 q6, [r2] +; CHECK-NEXT: ldrd r2, r3, [sp, #80] +; CHECK-NEXT: vmov r7, r5, d10 +; CHECK-NEXT: vmov r6, r4, d13 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: vmov d8, r0, r1 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov d11, r0, r1 +; CHECK-NEXT: vmov d10, r4, r5 +; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: vmov d9, r0, r1 +; CHECK-NEXT: vstrw.32 q5, [r10, #16] +; CHECK-NEXT: vstrw.32 q4, [r10] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} +define <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) #0 { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: strd r2, r3, [sp, #40] @ 8-byte Folded Spill +; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: add r2, sp, #160 +; CHECK-NEXT: add r0, sp, #224 +; CHECK-NEXT: vldrw.u32 q6, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r0, r1, d12 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vmov.f32 s24, s0 +; CHECK-NEXT: vmov.f32 s25, s1 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: strd r1, r0, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: add r0, sp, #208 +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: ldrd r2, r3, [sp, #152] +; CHECK-NEXT: vmov r0, r1, d14 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: add r5, sp, #256 +; CHECK-NEXT: add r3, sp, #240 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: add r2, sp, #176 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: vmov r4, r5, d0 +; CHECK-NEXT: vmov r6, r7, d10 +; CHECK-NEXT: vmov r2, r3, d9 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s19, s3 +; 
CHECK-NEXT: vmov r10, r9, d13 +; CHECK-NEXT: strd r4, r5, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r4, r5, d15 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: strd r4, r5, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: vmov r2, r3, d8 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: vmov r2, r3, d12 +; CHECK-NEXT: add r5, sp, #192 +; CHECK-NEXT: vldrw.u32 q7, [r5] +; CHECK-NEXT: vmov d11, r0, r1 +; CHECK-NEXT: mov r0, r10 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: vmov r11, r5, d15 +; CHECK-NEXT: vmov d10, r6, r7 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: vmov d13, r0, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #16] @ 8-byte Folded Reload +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: vmov d12, r0, r1 +; CHECK-NEXT: ldrd r0, r1, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: vmov d9, r0, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #40] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: vldrw.u32 q0, [sp, #24] @ 16-byte Reload +; CHECK-NEXT: vmov d8, r5, r6 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vstrw.32 q4, [r8, #48] +; CHECK-NEXT: vstrw.32 q5, [r8, #32] +; CHECK-NEXT: vstrw.32 q6, [r8, #16] +; CHECK-NEXT: vstrw.32 q0, [r8] +; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +define <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r10, r3 +; CHECK-NEXT: mov r11, r2 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: strd r0, r1, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: vmov r8, r9, d8 +; CHECK-NEXT: mov r6, 
r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r2, r11 +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) #0 { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .pad #56 +; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: str r0, [sp, #52] @ 4-byte Spill +; CHECK-NEXT: add r0, sp, #152 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: strd r2, r3, [sp, #28] @ 8-byte Folded Spill +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: strd r0, r1, [sp, #44] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: ldrd r2, r3, [sp, #128] +; CHECK-NEXT: strd r0, r1, [sp, #20] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: add r2, sp, #136 +; CHECK-NEXT: strd r1, r0, [sp, #36] @ 8-byte Folded Spill +; CHECK-NEXT: add r1, sp, #168 +; CHECK-NEXT: vldrw.u32 q5, [r2] +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmov r9, r6, d10 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: strd r0, r1, [sp, #12] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: vmov r7, r5, d11 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r8, r10, d8 +; CHECK-NEXT: mov r11, r1 +; CHECK-NEXT: mov r2, r7 +; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r11 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: add r3, sp, #20 +; CHECK-NEXT: strd r1, r0, [sp, #4] @ 8-byte Folded Spill +; CHECK-NEXT: ldm r3, {r0, r1, r2, r3} @ 16-byte 
Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r11, r0 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: ldrd r2, r3, [sp, #12] @ 8-byte Folded Reload +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r8, r1 +; CHECK-NEXT: mov r0, r7 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #128] +; CHECK-NEXT: ldrd r2, r3, [sp, #44] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: mov r2, r5 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: vmov d9, r0, r1 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r8 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: vmov d8, r0, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #36] @ 8-byte Folded Reload +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r6 +; CHECK-NEXT: vmov d11, r0, r1 +; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: mov r0, r11 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: vmov d10, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q4, [r0, #16] +; CHECK-NEXT: vstrw.32 q5, [r0] +; CHECK-NEXT: add sp, #56 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, %b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) #0 { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #128 +; CHECK-NEXT: sub sp, #128 +; CHECK-NEXT: add r1, sp, #320 +; CHECK-NEXT: mov r9, r2 +; CHECK-NEXT: str r2, [sp, #48] @ 4-byte Spill +; CHECK-NEXT: add r2, sp, #256 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vldrw.u32 q5, [r2] +; CHECK-NEXT: str r0, [sp, #124] @ 4-byte Spill +; CHECK-NEXT: mov r8, r3 +; CHECK-NEXT: vmov r0, r10, d9 +; CHECK-NEXT: str r3, [sp, #52] @ 4-byte Spill +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: str r0, [sp, #120] @ 4-byte Spill +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: strd r2, r3, [sp, #112] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: strd r2, r3, [sp, #96] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp, #72] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r5 +; 
CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: add r2, sp, #240 +; CHECK-NEXT: add r1, sp, #304 +; CHECK-NEXT: vldrw.u32 q5, [r2] +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r2, r3, d10 +; CHECK-NEXT: vmov r0, r1, d9 +; CHECK-NEXT: strd r2, r3, [sp, #80] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp, #104] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: vmov r2, r3, d11 +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: vmov r0, r1, d8 +; CHECK-NEXT: strd r2, r3, [sp, #64] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp, #40] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: vmov d9, r4, r5 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: add r0, sp, #288 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: mov r2, r9 +; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: vmov r0, r1, d11 +; CHECK-NEXT: strd r0, r1, [sp, #88] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r6, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: ldrd r2, r3, [sp, #232] +; CHECK-NEXT: strd r0, r1, [sp, #16] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: vmov d11, r4, r5 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: strd r1, r0, [sp, #32] @ 8-byte Folded Spill +; CHECK-NEXT: add r2, sp, #272 +; CHECK-NEXT: add r1, sp, #336 +; CHECK-NEXT: vldrw.u32 q7, [r2] +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmov r2, r3, d14 +; CHECK-NEXT: vmov r0, r1, d13 +; CHECK-NEXT: strd r2, r3, [sp, #24] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r1, [sp, #56] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: vmov r8, r11, d12 +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: vmov r2, r3, d15 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: strd r2, r3, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r4 +; CHECK-NEXT: mov r3, r7 +; CHECK-NEXT: bl __aeabi_dadd +; CHECK-NEXT: strd r1, r0, [sp] @ 8-byte Folded Spill +; CHECK-NEXT: ldrd r0, r1, [sp, #72] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #112] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #96] @ 8-byte Folded Reload +; CHECK-NEXT: mov r3, r10 +; CHECK-NEXT: ldr r2, [sp, #120] @ 4-byte Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: mov r1, r7 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #40] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #80] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r9, r0 +; CHECK-NEXT: mov r10, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #64] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #104] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r0, r9 +; CHECK-NEXT: mov r1, r10 +; CHECK-NEXT: vmov d8, r4, r7 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: mov r4, r0 +; CHECK-NEXT: mov r7, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #16] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #48] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r6, r0 
+; CHECK-NEXT: mov r5, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #232] +; CHECK-NEXT: ldrd r2, r3, [sp, #88] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: ldrd r1, r0, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: vmov d10, r4, r7 +; CHECK-NEXT: vmov d15, r0, r1 +; CHECK-NEXT: ldrd r1, r0, [sp, #32] @ 8-byte Folded Reload +; CHECK-NEXT: vmov d13, r0, r1 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: mov r1, r5 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: ldrd r2, r3, [sp, #24] @ 8-byte Folded Reload +; CHECK-NEXT: mov r7, r0 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r1, r11 +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r5, r0 +; CHECK-NEXT: mov r6, r1 +; CHECK-NEXT: ldrd r0, r1, [sp, #8] @ 8-byte Folded Reload +; CHECK-NEXT: ldrd r2, r3, [sp, #56] @ 8-byte Folded Reload +; CHECK-NEXT: bl __aeabi_dmul +; CHECK-NEXT: mov r2, r0 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r1, r6 +; CHECK-NEXT: bl __aeabi_dsub +; CHECK-NEXT: vmov d14, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #124] @ 4-byte Reload +; CHECK-NEXT: vmov d12, r7, r4 +; CHECK-NEXT: vstrw.32 q7, [r0, #48] +; CHECK-NEXT: vstrw.32 q4, [r0, #32] +; CHECK-NEXT: vstrw.32 q5, [r0, #16] +; CHECK-NEXT: vstrw.32 q6, [r0] +; CHECK-NEXT: add sp, #128 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare