diff --git a/llvm/include/llvm/CodeGen/ComplexArithmetic.h b/llvm/include/llvm/CodeGen/ComplexArithmetic.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/CodeGen/ComplexArithmetic.h @@ -0,0 +1,74 @@ +//===- ComplexArithmetic.h - Complex Arithmetic Pass --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H +#define LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H + +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" + +#ifndef ENUM_FLAG_DEF +#define ENUM_FLAG_DEF(Type) \ + inline Type operator~(Type a) { return (Type) ~(int)a; } \ + inline Type operator|(Type a, Type b) { return (Type)((int)a | (int)b); } \ + inline Type operator&(Type a, Type b) { return (Type)((int)a & (int)b); } \ + inline Type operator^(Type a, Type b) { return (Type)((int)a ^ (int)b); } \ + inline Type &operator|=(Type &a, Type b) { \ + return (Type &)((int &)a |= (int)b); \ + } \ + inline Type &operator&=(Type &a, Type b) { \ + return (Type &)((int &)a &= (int)b); \ + } \ + inline Type &operator^=(Type &a, Type b) { \ + return (Type &)((int &)a ^= (int)b); \ + } +#endif + +namespace llvm { + +class Function; +class TargetMachine; + +struct ComplexArithmeticPass : public PassInfoMixin { +private: + TargetMachine *TM; + +public: + ComplexArithmeticPass(TargetMachine *TM) : TM(TM) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +enum class ComplexArithmeticOperation { + None = 0, + CAdd = 1 << 0, + CMul = 1 << 1, + CMulPartial 
= 1 << 2 +}; + +ENUM_FLAG_DEF(ComplexArithmeticOperation) + +struct ComplexArithmeticData { +public: + ComplexArithmeticOperation OperationType; + unsigned Rotation; + Instruction *I; + + bool isOperationType(ComplexArithmeticOperation Op) { + return (OperationType & Op) == Op; + } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -82,6 +82,13 @@ /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(); + //===----------------------------------------------------------------------===// + // + // This pass implements generation of target-specific intrinsics to support + // handling of complex number arithmetic + // + FunctionPass *createComplexArithmeticPass(const TargetMachine *TM); + /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg /// load-linked/store-conditional loops. 
extern char &AtomicExpandID; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -22,6 +22,7 @@ #ifndef LLVM_CODEGEN_TARGETLOWERING_H #define LLVM_CODEGEN_TARGETLOWERING_H +#include "ComplexArithmetic.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -2959,6 +2960,16 @@ return isOperationLegalOrCustom(Op, VT); } + virtual bool supportsComplexArithmetic() const { + return false; + } + + virtual Value *createComplexArithmeticIR(ComplexArithmeticData &Data, + Value *InputA, Value *InputB, + int &GeneratedIntrinsicCount) const { + return nullptr; + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -114,9 +114,10 @@ void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); -void initializeConstantHoistingLegacyPassPass(PassRegistry&); -void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeCodeGenPreparePass(PassRegistry &); +void initializeComplexArithmeticLegacyPassPass(PassRegistry &); +void initializeConstantHoistingLegacyPassPass(PassRegistry &); +void initializeConstantMergeLegacyPassPass(PassRegistry &); void initializeConstraintEliminationPass(PassRegistry &); void initializeControlHeightReductionLegacyPassPass(PassRegistry&); void initializeCorrelatedValuePropagationPass(PassRegistry&); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ 
b/llvm/include/llvm/Transforms/Scalar.h @@ -562,13 +562,13 @@ // FunctionPass *createInstSimplifyLegacyPass(); - //===----------------------------------------------------------------------===// // // createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather // and scatter intrinsics with scalar code when target doesn't support them. // FunctionPass *createScalarizeMaskedMemIntrinLegacyPass(); -} // End llvm namespace + +} // namespace llvm #endif diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -44,6 +44,7 @@ CodeGenPassBuilder.cpp CodeGenPrepare.cpp CommandFlags.cpp + ComplexArithmetic.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DetectDeadLanes.cpp diff --git a/llvm/lib/CodeGen/ComplexArithmetic.cpp b/llvm/lib/CodeGen/ComplexArithmetic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ComplexArithmetic.cpp @@ -0,0 +1,841 @@ +//===- ComplexArithmeticPass.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ComplexArithmetic.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "complex-arithmetic" + +STATISTIC(NumComplexIntrinsics, "Number of complex intrinsics generated"); + +static cl::opt ComplexArithmeticEnabled( + "enable-complex-arithmetic", + cl::desc("Enable generation of complex arithmetic instructions"), + cl::init(true), cl::Hidden); + +namespace { + +/** + * Creates a contiguous mask of the given length, optionally with a base offset. + */ +static ArrayRef createContiguousMask(int len, int offset = 0) { + int *Arr = new int[len]; + for (int j = 0; j < len; j++) + Arr[j] = j + offset; + return ArrayRef(Arr, len); +} + +static ArrayRef createArrayWithStep(int len, int step, int offset) { + int *Arr = new int[len]; + for (int j = 0; j < len; j++) + Arr[j] = (j * step) + offset; + return ArrayRef(Arr, len); +} + +/** + * Creates an interleaving mask of the given length. 
+ */ +static ArrayRef createInterleavingMask(int len) { + int Step = len / 2; + int *Arr = new int[len]; + int idx = 0; + for (int j = 0; j < len; j += 2) { + Arr[j] = idx; + Arr[j + 1] = idx + Step; + idx++; + } + return ArrayRef(Arr, len); +} + +static ArrayRef createDeinterleavingMask(int len, int offset = 0) { + return createArrayWithStep(len, 2, offset); +} + +class ComplexArithmeticLegacyPass : public FunctionPass { +public: + static char ID; + + ComplexArithmeticLegacyPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeComplexArithmeticLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { return "Complex Arithmetic Pass"; } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + +private: + const TargetMachine *TM; +}; + +namespace { +/** + * Annotated graph-like structure that enriches the existing Instruction graph, + * allowing for contextual clues relevant to complex arithmetic to be provided + * and given to TTI hooks as required. + */ +class ComplexArithmeticGraph { +public: + /** + * Bitflags denoting the type of the instruction node. + */ + enum NodeType { + Unknown = 0, + // Actual node types + + Real = 1, + Imaginary = 2, + Load = 4, + Store = 8, + Shuffle = 16, + AddOperand = 32, + Input = 64, + Preserve = 128, + + // Meta node types, defining additional behaviour upon node creation + + /** + * Will cause the node to look at parents to try and identify the type. + * Parents must already be registered and identified. + */ + Discover = 0xffff, + }; + + struct Node { + public: + Instruction *I; + NodeType NType; + + Node(Instruction *i, enum NodeType nodeType) : I(i), NType(nodeType) {} + }; + + /** + * Returns a copy of the vector of all registered nodes. 
+ */ + SmallVector getAllNodes() { + SmallVector Is; + for (auto &N : Nodes) + Is.push_back(N->I); + return Is; + } + + SmallVector getAllNodesToReplace() { + SmallVector Is; + for (auto &N : Nodes) { + if ((N->NType & Preserve) == Preserve) + continue; + Is.push_back(N->I); + } + return Is; + } + + /** + * Returns a vector of all registered nodes that are of the given type. + */ + SmallVector getNodesOfType(enum NodeType Type) { + SmallVector Is; + for (auto &N : Nodes) { + if ((N->NType & Type) == Type) + Is.push_back(N->I); + } + return Is; + } + + /** + * Returns the node type of I. It must already be registered and identified, + * otherwise `Unknown` is returned. + */ + enum NodeType getNodeType(Instruction *I) { + auto *N = getNode(I); + if (N == nullptr) + return Unknown; + return N->NType; + } + + /** + * Registers and identifies the given Instruction, optionally with the + * provided NodeType. + */ + void addNode(Instruction *I, enum NodeType NodeType = NodeType::Unknown) { + if ((NodeType & Discover) == Discover) { + auto LeftType = getNodeType(cast(I->getOperand(0))); + auto RightType = getNodeType(cast(I->getOperand(1))); + + if (LeftType == Unknown || RightType == Unknown) { + NodeType = Unknown; + } else { + if (I->getOpcode() == Instruction::FMul) { + if (LeftType == RightType) + NodeType = Real; + else + NodeType = Imaginary; + } else { + NodeType = LeftType; + } + } + } + + auto *Existing = getNode(I); + if (Existing != nullptr) { + if (Existing->NType == NodeType) + return; + LLVM_DEBUG(I->dump()); + llvm_unreachable( + "A node has been added twice, with conflicting nodetypes."); + } + + auto N = std::make_unique(I, NodeType); + Nodes.push_back(std::move(N)); + } + + LLVMContext &getContext() { return CurrentI->getContext(); } + + Instruction *getCurrentInstruction() { return CurrentI; } + + void setCurrentInstruction(Instruction *I) { CurrentI = I; } + + void setType(enum ComplexArithmeticOperation type) { Type = type; } + + enum 
ComplexArithmeticOperation getType() { return Type; } + + void setRotation(unsigned R) { Rotation = R; } + + unsigned getRotation() { return Rotation; } + + /** + * Sets the graph userdata pointer. The graph then assumes ownership of the + * pointer, and will free it on deconstruction. + */ + template void setUserData(T *Ptr) { + UserData = std::shared_ptr(Ptr); + } + + /** + * Gets the graph userdata pointer, casting it to T. + * + * Note: No checks are made by the graph to ensure the type of the data is as + * requested. It is up to the caller to check for that. + */ + template T *getUserData() { + if (UserData == nullptr) + return nullptr; + auto *Ptr = UserData.get(); + return (T *)Ptr; + } + + ComplexArithmeticData getData() { + ComplexArithmeticData d; + d.OperationType = getType(); + d.I = getCurrentInstruction(); + d.Rotation = getRotation(); + return d; + } + +private: + unsigned Rotation = 0; + ComplexArithmeticOperation Type = ComplexArithmeticOperation::None; + Instruction *CurrentI = nullptr; + // std::unique_ptr doesn't support void* without an explicit deleter + std::shared_ptr UserData; + + Node *getNode(Instruction *I) { + for (const auto &item : Nodes) { + if (item->I == I) + return item.get(); + } + return nullptr; + } + + SmallVector> Nodes; +}; + +ENUM_FLAG_DEF(ComplexArithmeticGraph::NodeType) + +}; // namespace + +class ComplexArithmetic { +public: + ComplexArithmetic(const TargetLowering *tli) : TLI(tli) {} + bool runOnFunction(Function &F); + +private: + bool + evaluateComplexArithmeticBasicBlock(BasicBlock *B, + SmallVector &DeadInsts); + + void cleanupDeadInsts(SmallVector &DeadInsts); + + const TargetLowering *TLI = nullptr; +}; + +}; // namespace + +char ComplexArithmeticLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ComplexArithmeticLegacyPass, DEBUG_TYPE, + "Complex Arithmetic", false, false) +INITIALIZE_PASS_END(ComplexArithmeticLegacyPass, DEBUG_TYPE, + "Complex Arithmetic", false, false) + +PreservedAnalyses 
ComplexArithmeticPass::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLowering *TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + if (!ComplexArithmetic(TLI).runOnFunction(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +FunctionPass *llvm::createComplexArithmeticPass(const TargetMachine *TM) { + return new ComplexArithmeticLegacyPass(TM); +} + +bool ComplexArithmeticLegacyPass::runOnFunction(Function &F) { + const auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering(); + return ComplexArithmetic(TLI).runOnFunction(F); +} + +static bool HasBeenDisabled = false; +bool ComplexArithmetic::runOnFunction(Function &F) { + if (!ComplexArithmeticEnabled) { + LLVM_DEBUG(if (!HasBeenDisabled) dbgs() + << "Complex has been explicitly disabled.\n"); + return false; + } + + if (!TLI->supportsComplexArithmetic()) { + LLVM_DEBUG(if (!HasBeenDisabled) dbgs() + << "Complex has been disabled, " + "target does not support lowering of complex numbers.\n"); + return false; + } + + bool Changed = false; + SmallVector DeadInsts; + for (auto &B : F) + Changed |= evaluateComplexArithmeticBasicBlock(&B, DeadInsts); + + if (Changed) + cleanupDeadInsts(DeadInsts); + + return Changed; +} + +/** + * Checks the given mask, and determines whether said mask is interleaving. + * + * To be interleaving, a mask must alternate between `i` and `i + (Length / 2)`, + * and must contain all numbers within the range of `[0..Length)` + * (e.g. a 4x vector interleaving mask would be <0, 2, 1, 3>). + */ +static bool isInterleavingMask(ArrayRef Mask, int NumElements) { + if (Mask.size() != NumElements * 2) { + return false; + } + + for (unsigned i = 0; i < NumElements; ++i) { + if (Mask[(i * 2) + 1] != (Mask[i * 2] + NumElements)) { + return false; + } + } + + return true; +} + +/** + * Checks the mask of the given ShuffleVectorInst, and determines whether said + * shuffle is interleaving. See isInterleavingMask. 
+ */
+static bool isInterleaving(ShuffleVectorInst *SVI) {
+  // Template argument restored (lost in patch mangling): the operand must be
+  // a fixed-width vector for its element count to be queried.
+  auto *Ty = dyn_cast<FixedVectorType>(SVI->getOperand(0)->getType());
+  if (!Ty)
+    return false;
+
+  unsigned NumElements = Ty->getNumElements();
+  return isInterleavingMask(SVI->getShuffleMask(), NumElements);
+}
+
+/**
+ * Checks the given mask, and determines whether said mask is deinterleaving.
+ *
+ * To be deinterleaving, a mask must match the pattern `i * 2`, with an optional
+ * offset of 1. (e.g. a 4x vector deinterleaving mask would look like <0, 2, 4,
+ * 6> or <1, 3, 5, 7>).
+ */
+static bool isDeinterleavingMask(ArrayRef<int> Mask, int NumElements) {
+  if ((int)Mask.size() != NumElements)
+    return false;
+
+  // A deinterleaving mask advances by a stride of 2 between consecutive
+  // elements (see the doc comment above). The previous check compared against
+  // `Mask[i] + NumElements`, which describes an interleaving pair, not a
+  // deinterleaving stride, and rejected every valid mask with more than one
+  // element.
+  for (unsigned i = 0; i < Mask.size() - 1; ++i) {
+    if (Mask[i + 1] != (Mask[i] + 2))
+      return false;
+  }
+
+  return true;
+}
+
+static bool matchComplexPartialMul(ShuffleVectorInst *SVI,
+                                   ComplexArithmeticGraph &G) {
+  auto InterleavingMask = createInterleavingMask(SVI->getShuffleMask().size());
+  auto DeinterleavingLength = InterleavingMask.size() / 2;
+  auto DeinterleavingRealMask =
+      createDeinterleavingMask(DeinterleavingLength, 0);
+  auto DeinterleavingImagMask =
+      createDeinterleavingMask(DeinterleavingLength, 1);
+
+  Value *LoadA, *LoadB;
+  Value *AssertLoadA, *AssertLoadB;
+
+  auto MulByRealPatternA =
+      m_Shuffle(m_Shuffle(m_FMul(m_Value(LoadB), m_Value(LoadA)), m_Poison(),
+                          m_SpecificMask(DeinterleavingRealMask)),
+                m_FMul(m_Shuffle(m_Value(AssertLoadB), m_Poison(),
+                                 m_SpecificMask(DeinterleavingRealMask)),
+                       m_Shuffle(m_Value(AssertLoadA), m_Poison(),
+                                 m_SpecificMask(DeinterleavingImagMask))),
+                m_SpecificMask(InterleavingMask));
+
+  if (!match(SVI, MulByRealPatternA)) {
+    LLVM_DEBUG(dbgs() << "Failed to match MulByReal pattern.\n"; SVI->dump();
+               SVI->getParent()->dump());
+    return false;
+  }
+
+  if (LoadA != AssertLoadA || LoadB != AssertLoadB) {
+    LLVM_DEBUG(dbgs() << "Loads don't match expected pattern"
+                      << ".\n");
+    return false;
+  }
+
+  
G.addNode(cast(LoadA), ComplexArithmeticGraph::Load | + ComplexArithmeticGraph::Input | + ComplexArithmeticGraph::Preserve); + G.addNode(cast(LoadB), ComplexArithmeticGraph::Load | + ComplexArithmeticGraph::Input | + ComplexArithmeticGraph::Preserve); + + G.addNode(SVI, ComplexArithmeticGraph::Shuffle); + G.setRotation(0); + + auto *Op0 = cast(SVI->getOperand(0)); + auto *Op1 = cast(SVI->getOperand(1)); + + G.addNode(Op0, ComplexArithmeticGraph::Real); + G.addNode(Op1, ComplexArithmeticGraph::Imaginary); + + G.addNode(cast(Op1->getOperand(0)), + ComplexArithmeticGraph::Real | ComplexArithmeticGraph::Shuffle); + G.addNode(cast(Op1->getOperand(1)), + ComplexArithmeticGraph::Imaginary | + ComplexArithmeticGraph::Shuffle); + + G.setType(llvm::ComplexArithmeticOperation::CMulPartial); + + return true; +} + +static bool matchComplexMul(ShuffleVectorInst *SVI, + ComplexArithmeticGraph &G) { + unsigned LikelyRotation = 0; + + Value *LeftShuffleAR; + Value *LeftShuffleAI; + Value *LeftShuffleBR; + Value *LeftShuffleBI; + + Value *RightShuffleAR; + Value *RightShuffleAI; + Value *RightShuffleBR; + Value *RightShuffleBI; + + auto Mask = createInterleavingMask(SVI->getShuffleMask().size()); + + auto InterleaveShuffleRot0Pattern = m_Shuffle( + m_FSub(m_FMul(m_Value(LeftShuffleBR), m_Value(LeftShuffleAR)), + m_FMul(m_Value(LeftShuffleBI), m_Value(LeftShuffleAI))), + m_FAdd(m_FMul(m_Value(RightShuffleBI), m_Value(RightShuffleAR)), + m_FMul(m_Value(RightShuffleBR), m_Value(RightShuffleAI))), + m_SpecificMask(Mask)); + + auto InterleaveShuffleRot180Pattern = m_Shuffle( + m_FSub(m_FMul(m_Value(LeftShuffleBI), m_Value(LeftShuffleAI)), + m_FMul(m_Value(LeftShuffleBR), m_Value(LeftShuffleAR))), + m_FSub(m_FMul(m_Value(RightShuffleBR), m_FNeg(m_Value(RightShuffleAI))), + m_FMul(m_Value(RightShuffleBI), m_Value(RightShuffleAR))), + m_SpecificMask(Mask)); + + if (match(SVI, InterleaveShuffleRot0Pattern)) + LikelyRotation = 0; + else if (match(SVI, InterleaveShuffleRot180Pattern)) + 
LikelyRotation = 180; + else { + LLVM_DEBUG(dbgs() << "SVI does not match expected patterns.\n"); + return false; + } + + if (LeftShuffleAR != RightShuffleAR) + return false; + if (LeftShuffleAI != RightShuffleAI) + return false; + if (LeftShuffleBR != RightShuffleBR) + return false; + if (LeftShuffleBI != RightShuffleBI) + return false; + + G.addNode(cast(LeftShuffleAR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(LeftShuffleAI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + G.addNode(cast(LeftShuffleBR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(LeftShuffleBI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + + G.addNode(cast(cast(LeftShuffleAR)->getOperand(0)), + ComplexArithmeticGraph::Input | ComplexArithmeticGraph::Preserve); + G.addNode(cast(cast(LeftShuffleBR)->getOperand(0)), + ComplexArithmeticGraph::Input | ComplexArithmeticGraph::Preserve); + + auto *Op0 = cast(SVI->getOperand(0)); + auto *FAdd = cast(SVI->getOperand(1)); + + G.addNode(Op0, ComplexArithmeticGraph::Real); + G.addNode(FAdd, ComplexArithmeticGraph::Imaginary); + + G.addNode(cast(Op0->getOperand(0)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(Op0->getOperand(1)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(FAdd->getOperand(0)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(FAdd->getOperand(1)), + ComplexArithmeticGraph::Discover); + + G.setType(llvm::ComplexArithmeticOperation::CMul); + G.setRotation(LikelyRotation); + + return true; +} + +static bool matchComplexAdd(ShuffleVectorInst *SVI, + ComplexArithmeticGraph &G) { + Value *ShuffleAR; + Value *ShuffleAI; + Value *ShuffleBR; + Value *ShuffleBI; + + auto *Op0 = dyn_cast(SVI->getOperand(0)); + auto *Op1 = dyn_cast(SVI->getOperand(1)); + + if (!Op0 || !Op1) + return false; + + unsigned Rotation; + // TODO Support Rotations 0 and 180 + if (Op0->getOpcode() == 
Instruction::FSub &&
+      Op1->getOpcode() == Instruction::FAdd) {
+    Rotation = 90;
+  } else if (Op0->getOpcode() == Instruction::FAdd &&
+             Op1->getOpcode() == Instruction::FSub) {
+    Rotation = 270;
+  } else {
+    return false;
+  }
+
+  auto ShuffleMask = createInterleavingMask(SVI->getShuffleMask().size());
+
+  if (Rotation == 90) {
+    if (!match(SVI, m_Shuffle(m_FSub(m_Value(ShuffleAR), m_Value(ShuffleBI)),
+                              m_FAdd(m_Value(ShuffleAI), m_Value(ShuffleBR)),
+                              m_SpecificMask(ShuffleMask)))) {
+      LLVM_DEBUG(
+          dbgs() << "SVI does not match expected pattern for complex add rot "
+                 << Rotation << ".\n");
+      return false;
+    }
+  } else if (Rotation == 270) {
+    if (!match(SVI, m_Shuffle(m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)),
+                              m_FSub(m_Value(ShuffleAI), m_Value(ShuffleBR)),
+                              m_SpecificMask(ShuffleMask)))) {
+      LLVM_DEBUG(
+          dbgs() << "SVI does not match expected pattern for complex add rot "
+                 << Rotation << ".\n");
+      return false;
+    }
+  }
+
+  // Copy-paste fix: the fourth test previously re-checked ShuffleAI and never
+  // validated ShuffleBI, so a non-shuffle ShuffleBI reached the cast below.
+  if (!isa<ShuffleVectorInst>(ShuffleAR) ||
+      !isa<ShuffleVectorInst>(ShuffleAI) ||
+      !isa<ShuffleVectorInst>(ShuffleBR) ||
+      !isa<ShuffleVectorInst>(ShuffleBI)) {
+    LLVM_DEBUG(dbgs() << "SVI does not match expected pattern for complex add, "
+                         "inputs aren't all shuffles.\n");
+    return false;
+  }
+
+  auto *InputA = cast<ShuffleVectorInst>(ShuffleAR)->getOperand(0);
+  auto *InputB = cast<ShuffleVectorInst>(ShuffleBR)->getOperand(0);
+
+  if (!isa<Instruction>(InputA) || !isa<Instruction>(InputB)) {
+    LLVM_DEBUG(dbgs() << "Evaluated inputs aren't instructions.\n");
+    return false;
+  }
+
+  G.addNode(cast<Instruction>(InputA), ComplexArithmeticGraph::Input |
+                                           ComplexArithmeticGraph::Preserve);
+  G.addNode(cast<Instruction>(InputB), ComplexArithmeticGraph::Input |
+                                           ComplexArithmeticGraph::Preserve);
+
+  G.addNode(cast<Instruction>(ShuffleAR),
+            ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  G.addNode(cast<Instruction>(ShuffleAI),
+            ComplexArithmeticGraph::Shuffle |
+                ComplexArithmeticGraph::Imaginary);
+  G.addNode(cast<Instruction>(ShuffleBR),
+            ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  G.addNode(cast<Instruction>(ShuffleBI),
+            ComplexArithmeticGraph::Shuffle |
+                ComplexArithmeticGraph::Imaginary);
+
+  
G.addNode(Op0, ComplexArithmeticGraph::AddOperand); + G.addNode(Op1, ComplexArithmeticGraph::AddOperand); + + G.setType(ComplexArithmeticOperation::CAdd); + G.setRotation(Rotation); + + return true; +} + +static bool traverseAndPopulateGraph(const TargetLowering *TLI, Instruction *I, + ComplexArithmeticGraph &G) { + G.setCurrentInstruction(I); + + // Shuffle mask needs to interleave vectors + // e.g. + // <4 x i32> <0, 2, 1, 3> + // <8 x i32> <0, 4, 1, 5, 2, 6, 3, 7> + + if (auto *SVI = dyn_cast(I)) { + if (!isInterleaving(SVI)) { + LLVM_DEBUG(dbgs() << "SVI doesn't appear to perform interleaving" + << ".\n"); + return false; + } + + if (matchComplexPartialMul(SVI, G)) + return true; + + if (matchComplexMul(SVI, G)) + return true; + + if (matchComplexAdd(SVI, G)) + return true; + + // if (TLI->matchComplexArithmeticIR(SVI, G)) + // return true; + } + + return false; +} + +static bool substituteGraph(const TargetLowering *TLI, Instruction *I, + ComplexArithmeticGraph &G, + SmallVector &DeadInsts) { + G.setCurrentInstruction(I); + + SmallVector Inputs = + G.getNodesOfType(ComplexArithmeticGraph::Input); + + auto *LoadA = Inputs[0]; + auto *LoadB = Inputs[1]; + + auto *TyA = cast(LoadA->getType()); + auto *TyB = cast(LoadB->getType()); + + FixedVectorType *WideType; + FixedVectorType *NarrowType; + if (TyA->getNumElements() >= TyB->getNumElements()) { + WideType = TyA; + NarrowType = TyB; + } else { + WideType = TyB; + NarrowType = TyA; + } + + if (NarrowType->getNumElements() != WideType->getNumElements() && + NarrowType->getNumElements() != WideType->getNumElements() / 2) { + LLVM_DEBUG( + dbgs() + << "Narrow type is not equal to or half the width of the wide type" + << ".\n"); + return false; + } + + unsigned WideStride; + unsigned NarrowStride; + + const unsigned MaxVectorWidth = 128; + + unsigned NumBits = + WideType->getScalarSizeInBits() * WideType->getNumElements(); + WideStride = MaxVectorWidth / WideType->getScalarSizeInBits(); + if 
(NarrowType->getNumElements() == WideType->getNumElements()) + NarrowStride = WideStride; + else + NarrowStride = WideType->getNumElements() / WideStride; + + int GeneratedIntrinsics; + auto GraphData = G.getData(); + if (NumBits > MaxVectorWidth) { + LLVM_DEBUG(dbgs() << "Split required, " << NumBits + << " is greater than the max vector width (" + << MaxVectorWidth << ")" + << ".\n"); + if (NumBits % MaxVectorWidth != 0) { + LLVM_DEBUG(dbgs() << "Vector can't be split evenly" + << ".\n"); + return false; + } + + IRBuilder<> B(I); + + unsigned SplitCount = NumBits / MaxVectorWidth; + + if (SplitCount > 2) { + LLVM_DEBUG(dbgs() << "Cannot split operation beyond 2" + << ".\n"); + return false; + } + + SmallVector CreatedInsts; + SmallVector ComplexIR; + for (unsigned i = 0; i < SplitCount; ++i) { + ArrayRef WideMask = createContiguousMask(WideStride, WideStride * i); + ArrayRef NarrowMask = + createContiguousMask(NarrowStride, NarrowStride * i); + + auto *Undef = UndefValue::get(LoadA->getType()); + auto *Undef2 = UndefValue::get(LoadB->getType()); + Value *ShuffleA, *ShuffleB; + if (TyA == WideType) { + ShuffleA = B.CreateShuffleVector( + LoadA, Undef, WideMask.take_front(TyA->getNumElements() / 2)); + ShuffleB = B.CreateShuffleVector( + LoadB, Undef2, NarrowMask.take_front(TyB->getNumElements() / 2)); + } else { + ShuffleA = B.CreateShuffleVector( + LoadB, Undef, WideMask.take_front(TyB->getNumElements() / 2)); + ShuffleB = B.CreateShuffleVector( + LoadA, Undef2, NarrowMask.take_front(TyA->getNumElements() / 2)); + } + + CreatedInsts.push_back(ShuffleA); + CreatedInsts.push_back(ShuffleB); + + auto *IR = TLI->createComplexArithmeticIR(GraphData, ShuffleA, ShuffleB, + GeneratedIntrinsics); + if (IR == nullptr) { + for (auto &item : CreatedInsts) + DeadInsts.push_back(cast(item)); + return false; + } + NumComplexIntrinsics += GeneratedIntrinsics; + ComplexIR.push_back(IR); + CreatedInsts.push_back(IR); + } + ArrayRef Mask = createContiguousMask(WideStride * 2); + 
auto *Shuffle = B.CreateShuffleVector(ComplexIR[0], ComplexIR[1], Mask); + I->replaceAllUsesWith(Shuffle); + } else { + auto *Mla = TLI->createComplexArithmeticIR(GraphData, LoadA, LoadB, + GeneratedIntrinsics); + if (Mla == nullptr) + return false; + NumComplexIntrinsics += GeneratedIntrinsics; + I->replaceAllUsesWith(Mla); + } + + for (auto &item : G.getAllNodesToReplace()) + DeadInsts.push_back(item); + + return true; +} + +bool ComplexArithmetic::evaluateComplexArithmeticBasicBlock( + BasicBlock *B, SmallVector &DeadInsts) { + ComplexArithmeticGraph Graph; + + bool Changed = false; + bool Substituted = false; + + for (auto &I : *B) { + if (auto *SVI = dyn_cast(&I)) { + if (isInterleaving(SVI)) { + Graph.addNode(SVI, ComplexArithmeticGraph::Shuffle); + Changed = traverseAndPopulateGraph(TLI, SVI, Graph); + } + } + if (Changed) { + LLVM_DEBUG(dbgs() << "Trying to substitute graph in block: \n"; + B->dump();); + Substituted = substituteGraph(TLI, &I, Graph, DeadInsts); + if (Substituted) + LLVM_DEBUG(dbgs() << "Block now looks like: \n"; B->dump();); + Changed = false; + } + } + + return Substituted; +} + +void ComplexArithmetic::cleanupDeadInsts( + SmallVector &DeadInsts) { + + // TODO clean up the dead instructions better. (Ask in review?) 
+  // Iterate to a fixed point: erasing one instruction may remove the last use
+  // of another instruction in the worklist. Erased instructions are removed
+  // from DeadInsts immediately — the previous loop left the (now freed)
+  // Instruction* in the vector and dereferenced it (getParent/getNumUses) on
+  // the next pass, a use-after-free.
+  bool ErasedAny = true;
+  while (!DeadInsts.empty() && ErasedAny) {
+    ErasedAny = false;
+    for (auto *It = DeadInsts.begin(); It != DeadInsts.end();) {
+      Instruction *I = *It;
+      if (I->getParent() && I->getNumUses() == 0) {
+        I->eraseFromParent();
+        It = DeadInsts.erase(It);
+        ErasedAny = true;
+      } else {
+        ++It;
+      }
+    }
+  }
+
+  DeadInsts.clear();
+}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -739,6 +739,13 @@
   bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
+    bool supportsComplexArithmetic() const override;
+
+    Value *
+    createComplexArithmeticIR(ComplexArithmeticData &Data, Value *InputA,
+                              Value *InputB,
+                              int &GeneratedIntrinsicCount) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21738,3 +21738,85 @@
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+bool ARMTargetLowering::supportsComplexArithmetic() const {
+  return Subtarget->hasMVEFloatOps();
+}
+
+Value *ARMTargetLowering::createComplexArithmeticIR(
+    ComplexArithmeticData &Data, Value *InputA, Value *InputB,
+    int &GeneratedIntrinsicCount) const {
+  auto *Ty = InputA->getType();
+  if (!isa<FixedVectorType>(Ty))
+    return nullptr;
+  auto *VTy = cast<FixedVectorType>(Ty);
+
+  // Cannot widen complex intrinsics to fill vectors
+  if (VTy->getNumElements() * VTy->getScalarSizeInBits() != 128)
+    return nullptr;
+
+  // MVE does not support double complex operations
+  if (VTy->getScalarType()->isDoubleTy())
+    return nullptr;
+
+  IRBuilder<> B(Data.I);
+  auto *IntTy = Type::getInt32Ty(B.getContext());
+
+  if 
(Data.isOperationType(ComplexArithmeticOperation::CMulPartial)) { + + auto *TyA = InputA->getType(); + auto *TyB = InputB->getType(); + + ConstantInt *ConstMulRot = nullptr; + + if (Data.Rotation == 0) + ConstMulRot = ConstantInt::get(IntTy, 0); + else if (Data.Rotation == 180) + ConstMulRot = ConstantInt::get(IntTy, 2); + + if (!ConstMulRot) + return nullptr; + + auto *Mul = B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputB, InputA}); + GeneratedIntrinsicCount = 1; + return Mul; + } + + if (Data.isOperationType(ComplexArithmeticOperation::CMul)) { + + int RotIdx = Data.Rotation / 90; + + auto *ConstMulRot = ConstantInt::get(IntTy, RotIdx); + auto *ConstMlaRot = ConstantInt::get(IntTy, (RotIdx + 1) % 4); + auto *Mul = B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputA, InputB}); + auto *Mla = B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstMlaRot, Mul, InputA, InputB}); + GeneratedIntrinsicCount = 2; + return Mla; + } + + if (Data.isOperationType(ComplexArithmeticOperation::CAdd)) { + + // 1 means the value is not halved. + unsigned HalvingVal = 1; + auto *Halving = ConstantInt::get(IntTy, HalvingVal); + + unsigned RotKey; + if (Data.Rotation == 90) + RotKey = 0; + else if (Data.Rotation == 270) + RotKey = 1; + else + return nullptr; // Invalid rotation for arm_mve_vcaddq + + auto *RotVal = ConstantInt::get(IntTy, RotKey); + GeneratedIntrinsicCount = 1; + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {Halving, RotVal, InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -425,12 +425,17 @@ TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. 
- if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createARMParallelDSPPass()); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexArithmeticPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. - if (TM->getOptLevel() != CodeGenOpt::None) + if (TM->getOptLevel() != CodeGenOpt::None) { addPass(createInterleavedAccessPass()); + } // Add Control Flow Guard checks. if (TM->getTargetTriple().isOSWindows()) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -12,6 +12,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/CodeGen/ComplexArithmetic.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" @@ -20,8 +21,8 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-add.ll @@ -0,0 +1,273 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 3 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void 
@complex_add_v2f16(<2 x half>* %a, <2 x half>* %b, <2 x half>* %c) #0 { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r3, [r0] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: ldr r3, [r1] +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vsub.f16 s4, s4, s2 +; CHECK-NEXT: vadd.f16 s0, s6, s0 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: str r3, [r2] +; CHECK-NEXT: b .LBB0_1 + +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <2 x half>, <2 x half>* %a, align 4 + %strided.vec = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %strided.vec21 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %wide.vec22 = load <2 x half>, <2 x half>* %b, align 4 + %strided.vec23 = shufflevector <2 x half> %wide.vec22, <2 x half> zeroinitializer, <1 x i32> + %strided.vec24 = shufflevector <2 x half> %wide.vec22, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %strided.vec23, %strided.vec21 + %1 = fadd fast <1 x half> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + store <2 x half> %interleaved.vec, <2 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v4f16(<4 x half>* %a, <4 x half>* %b, <4 x half>* %c) #0 { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrd r3, r12, [r0] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.32 q0[1], r12 +; CHECK-NEXT: ldrd r3, r12, [r1] +; CHECK-NEXT: vmovx.f16 s4, s0 +; CHECK-NEXT: vmov.32 q2[0], r3 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmov.32 q2[1], r12 +; 
CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmovx.f16 s12, s8 +; CHECK-NEXT: vmovx.f16 s2, s9 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s9 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vsub.f16 q1, q2, q1 +; CHECK-NEXT: vadd.f16 q0, q3, q0 +; CHECK-NEXT: vmovx.f16 s5, s4 +; CHECK-NEXT: vins.f16 s4, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmov r3, r12, d2 +; CHECK-NEXT: strd r3, r12, [r2] +; CHECK-NEXT: b .LBB1_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <4 x half>, <4 x half>* %a, align 4 + %strided.vec = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %strided.vec21 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %wide.vec22 = load <4 x half>, <4 x half>* %b, align 4 + %strided.vec23 = shufflevector <4 x half> %wide.vec22, <4 x half> zeroinitializer, <2 x i32> + %strided.vec24 = shufflevector <4 x half> %wide.vec22, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %strided.vec23, %strided.vec21 + %1 = fadd fast <2 x half> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + store <4 x half> %interleaved.vec, <4 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v8f16(<8 x half>* %a, <8 x half>* %b, <8 x half>* %c) #0 { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 +; CHECK-NEXT: vstrw.32 q0, [r2] +; CHECK-NEXT: b .LBB2_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <8 x half>, <8 x half>* %a, align 4 + %strided.vec = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %strided.vec21 = 
shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %wide.vec22 = load <8 x half>, <8 x half>* %b, align 4 + %strided.vec23 = shufflevector <8 x half> %wide.vec22, <8 x half> zeroinitializer, <4 x i32> + %strided.vec24 = shufflevector <8 x half> %wide.vec22, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %strided.vec23, %strided.vec21 + %1 = fadd fast <4 x half> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + store <8 x half> %interleaved.vec, <8 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vcadd.f16 q1, q2, q1, #90 +; CHECK-NEXT: vcadd.f16 q0, q3, q0, #90 +; CHECK-NEXT: vstrw.32 q0, [r2, #16] +; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: b .LBB3_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <16 x half>, <16 x half>* %a, align 4 + %strided.vec = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %strided.vec21 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %wide.vec22 = load <16 x half>, <16 x half>* %b, align 4 + %strided.vec23 = shufflevector <16 x half> %wide.vec22, <16 x half> zeroinitializer, <8 x i32> + %strided.vec24 = shufflevector <16 x half> %wide.vec22, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %strided.vec23, %strided.vec21 + %1 = fadd fast <8 x half> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + store <16 x half> 
%interleaved.vec, <16 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vmovx.f16 s2, s11 +; CHECK-NEXT: vmovx.f16 s5, s10 +; CHECK-NEXT: vldrw.u32 q3, [r1, #32] +; CHECK-NEXT: vins.f16 s5, s2 +; CHECK-NEXT: vmov.f32 s2, s16 +; CHECK-NEXT: vmov.f32 s1, s10 +; CHECK-NEXT: vmov.f32 s0, s8 +; CHECK-NEXT: vmovx.f16 s4, s8 +; CHECK-NEXT: vmovx.f16 s6, s9 +; CHECK-NEXT: vins.f16 s2, s17 +; CHECK-NEXT: vmovx.f16 s8, s17 +; CHECK-NEXT: vmovx.f16 s10, s15 +; CHECK-NEXT: vmovx.f16 s17, s14 +; CHECK-NEXT: vmov.f32 s21, s14 +; CHECK-NEXT: vmov.f32 s20, s12 +; CHECK-NEXT: vins.f16 s4, s6 +; CHECK-NEXT: vmovx.f16 s6, s16 +; CHECK-NEXT: vins.f16 s17, s10 +; CHECK-NEXT: vins.f16 s21, s15 +; CHECK-NEXT: vins.f16 s20, s13 +; CHECK-NEXT: vmovx.f16 s16, s12 +; CHECK-NEXT: vmovx.f16 s10, s13 +; CHECK-NEXT: vldrw.u32 q3, [r1, #48] +; CHECK-NEXT: vins.f16 s6, s8 +; CHECK-NEXT: vmovx.f16 s7, s18 +; CHECK-NEXT: vmovx.f16 s8, s19 +; CHECK-NEXT: vmov.f32 s22, s12 +; CHECK-NEXT: vmov.f32 s23, s14 +; CHECK-NEXT: vmov.f32 s3, s18 +; CHECK-NEXT: vins.f16 s16, s10 +; CHECK-NEXT: vmovx.f16 s18, s12 +; CHECK-NEXT: vmovx.f16 s10, s13 +; CHECK-NEXT: vins.f16 s22, s13 +; CHECK-NEXT: vins.f16 s23, s15 +; CHECK-NEXT: vins.f16 s7, s8 +; CHECK-NEXT: vins.f16 s3, s19 +; CHECK-NEXT: vins.f16 s18, s10 +; CHECK-NEXT: vmovx.f16 s19, s14 +; CHECK-NEXT: vmovx.f16 s10, s15 +; CHECK-NEXT: vsub.f16 q1, q5, q1 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: vins.f16 s1, s11 +; CHECK-NEXT: vins.f16 s0, s9 +; CHECK-NEXT: vins.f16 s19, s10 +; 
CHECK-NEXT: vadd.f16 q2, q4, q0 +; CHECK-NEXT: vmovx.f16 s0, s23 +; CHECK-NEXT: vmovx.f16 s1, s22 +; CHECK-NEXT: vmov.f32 s13, s22 +; CHECK-NEXT: vmov.f32 s12, s20 +; CHECK-NEXT: vins.f16 s1, s0 +; CHECK-NEXT: vins.f16 s13, s23 +; CHECK-NEXT: vins.f16 s12, s21 +; CHECK-NEXT: vmovx.f16 s0, s20 +; CHECK-NEXT: vmovx.f16 s2, s21 +; CHECK-NEXT: vldrw.u32 q4, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vmov.f32 s15, s18 +; CHECK-NEXT: vmovx.f16 s3, s18 +; CHECK-NEXT: vmovx.f16 s18, s23 +; CHECK-NEXT: vmovx.f16 s25, s22 +; CHECK-NEXT: vmov.f32 s29, s22 +; CHECK-NEXT: vmov.f32 s28, s20 +; CHECK-NEXT: vins.f16 s25, s18 +; CHECK-NEXT: vins.f16 s29, s23 +; CHECK-NEXT: vins.f16 s28, s21 +; CHECK-NEXT: vmovx.f16 s24, s20 +; CHECK-NEXT: vmovx.f16 s18, s21 +; CHECK-NEXT: vldrw.u32 q5, [r1, #16] +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: vmovx.f16 s2, s16 +; CHECK-NEXT: vmovx.f16 s14, s17 +; CHECK-NEXT: vins.f16 s24, s18 +; CHECK-NEXT: vmovx.f16 s26, s20 +; CHECK-NEXT: vmovx.f16 s18, s21 +; CHECK-NEXT: vins.f16 s2, s14 +; CHECK-NEXT: vmov.f32 s14, s16 +; CHECK-NEXT: vmovx.f16 s16, s19 +; CHECK-NEXT: vins.f16 s26, s18 +; CHECK-NEXT: vmov.f32 s30, s20 +; CHECK-NEXT: vmov.f32 s31, s22 +; CHECK-NEXT: vmovx.f16 s27, s22 +; CHECK-NEXT: vmovx.f16 s18, s23 +; CHECK-NEXT: vins.f16 s14, s17 +; CHECK-NEXT: vins.f16 s15, s19 +; CHECK-NEXT: vins.f16 s30, s21 +; CHECK-NEXT: vins.f16 s31, s23 +; CHECK-NEXT: vins.f16 s3, s16 +; CHECK-NEXT: vins.f16 s27, s18 +; CHECK-NEXT: vsub.f16 q4, q7, q0 +; CHECK-NEXT: vadd.f16 q5, q6, q3 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: vst20.16 {q4, q5}, [r2] +; CHECK-NEXT: vst21.16 {q4, q5}, [r3]! 
+; CHECK-NEXT: vst20.16 {q1, q2}, [r3] +; CHECK-NEXT: vst21.16 {q1, q2}, [r3] +; CHECK-NEXT: b .LBB4_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <32 x half>, <32 x half>* %a, align 4 + %strided.vec = shufflevector <32 x half> %wide.vec, <32 x half> zeroinitializer, <16 x i32> + %strided.vec21 = shufflevector <32 x half> %wide.vec, <32 x half> zeroinitializer, <16 x i32> + %wide.vec22 = load <32 x half>, <32 x half>* %b, align 4 + %strided.vec23 = shufflevector <32 x half> %wide.vec22, <32 x half> zeroinitializer, <16 x i32> + %strided.vec24 = shufflevector <32 x half> %wide.vec22, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x half> %strided.vec23, %strided.vec21 + %1 = fadd fast <16 x half> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + store <32 x half> %interleaved.vec, <32 x half>* %c, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f16-mul.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 6 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f16(<2 x half>* %a, <2 x half>* %b, <2 x half>* %c) #0 { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr.w r12, [r0] +; CHECK-NEXT: ldr r3, [r1] +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: vmov.32 q1[0], r3 
+; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vmul.f16 s8, s6, s2 +; CHECK-NEXT: vmul.f16 s2, s4, s2 +; CHECK-NEXT: vfnms.f16 s8, s4, s0 +; CHECK-NEXT: vfma.f16 s2, s6, s0 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: str r3, [r2] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <2 x half>, <2 x half>* %a + %b.val = load <2 x half>, <2 x half>* %b + %strided.vec = shufflevector <2 x half> %a.val, <2 x half> poison, <1 x i32> + %strided.vec46 = shufflevector <2 x half> %a.val, <2 x half> poison, <1 x i32> + %strided.vec48 = shufflevector <2 x half> %b.val, <2 x half> poison, <1 x i32> + %strided.vec49 = shufflevector <2 x half> %b.val, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %strided.vec48, %strided.vec + %1 = fmul fast <1 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <1 x half> %0, %1 + %3 = fmul fast <1 x half> %strided.vec49, %strided.vec + %4 = fmul fast <1 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %2, <1 x half> %5, <2 x i32> + store <2 x half> %interleaved.vec, <2 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v4f16(<4 x half>* %a, <4 x half>* %b, <4 x half>* %c) #0 { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrd r3, r12, [r0] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: ldrd r3, lr, [r1] +; CHECK-NEXT: vmov.32 q0[1], r12 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmov.32 q1[1], lr +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: 
vins.f16 s16, s2 +; CHECK-NEXT: vmul.f16 q3, q4, q2 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vneg.f16 q3, q3 +; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vmul.f16 q1, q1, q2 +; CHECK-NEXT: vfma.f16 q1, q4, q0 +; CHECK-NEXT: vmovx.f16 s13, s12 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmov r3, r12, d6 +; CHECK-NEXT: strd r3, r12, [r2] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <4 x half>, <4 x half>* %a + %b.val = load <4 x half>, <4 x half>* %b + %strided.vec = shufflevector <4 x half> %a.val, <4 x half> poison, <2 x i32> + %strided.vec46 = shufflevector <4 x half> %a.val, <4 x half> poison, <2 x i32> + %strided.vec48 = shufflevector <4 x half> %b.val, <4 x half> poison, <2 x i32> + %strided.vec49 = shufflevector <4 x half> %b.val, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %strided.vec48, %strided.vec + %1 = fmul fast <2 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <2 x half> %0, %1 + %3 = fmul fast <2 x half> %strided.vec49, %strided.vec + %4 = fmul fast <2 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %2, <2 x half> %5, <4 x i32> + store <4 x half> %interleaved.vec, <4 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v8f16(<8 x half>* %a, <8 x half>* %b, <8 x half>* %c) #0 { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #90 +; CHECK-NEXT: vstrw.32 q2, [r2] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <8 x half>, <8 x half>* %a + %b.val = load <8 x half>, <8 
x half>* %b + %strided.vec = shufflevector <8 x half> %a.val, <8 x half> poison, <4 x i32> + %strided.vec46 = shufflevector <8 x half> %a.val, <8 x half> poison, <4 x i32> + %strided.vec48 = shufflevector <8 x half> %b.val, <8 x half> poison, <4 x i32> + %strided.vec49 = shufflevector <8 x half> %b.val, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %strided.vec48, %strided.vec + %1 = fmul fast <4 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <4 x half> %0, %1 + %3 = fmul fast <4 x half> %strided.vec49, %strided.vec + %4 = fmul fast <4 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %2, <4 x half> %5, <8 x i32> + store <8 x half> %interleaved.vec, <8 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x half>* %c) #0 { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vcmul.f16 q4, q1, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q1, q2, #90 +; CHECK-NEXT: vcmul.f16 q1, q0, q3, #0 +; CHECK-NEXT: vcmla.f16 q1, q0, q3, #90 +; CHECK-NEXT: vstrw.32 q1, [r2, #16] +; CHECK-NEXT: vstrw.32 q4, [r2] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <16 x half>, <16 x half>* %a + %b.val = load <16 x half>, <16 x half>* %b + %strided.vec = shufflevector <16 x half> %a.val, <16 x half> poison, <8 x i32> + %strided.vec46 = shufflevector <16 x half> %a.val, <16 x half> poison, <8 x i32> + %strided.vec48 = shufflevector <16 x half> %b.val, <16 x half> poison, <8 x i32> + %strided.vec49 = shufflevector <16 x half> %b.val, <16 x half> 
poison, <8 x i32> + %0 = fmul fast <8 x half> %strided.vec48, %strided.vec + %1 = fmul fast <8 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <8 x half> %0, %1 + %3 = fmul fast <8 x half> %strided.vec49, %strided.vec + %4 = fmul fast <8 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <8 x half> %3, %4 + %interleaved.vec = shufflevector <8 x half> %2, <8 x half> %5, <16 x i32> + store <16 x half> %interleaved.vec, <16 x half>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x half>* %c) #0 { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vld20.16 {q2, q3}, [r1] +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: vld20.16 {q0, q1}, [r0] +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: vld21.16 {q2, q3}, [r3]! +; CHECK-NEXT: vld21.16 {q0, q1}, [r12]! +; CHECK-NEXT: vld20.16 {q4, q5}, [r3] +; CHECK-NEXT: vld20.16 {q6, q7}, [r12] +; CHECK-NEXT: vld21.16 {q4, q5}, [r3] +; CHECK-NEXT: vld21.16 {q6, q7}, [r12] +; CHECK-NEXT: vstmia sp, {d4, d5, d6, d7} @ 32-byte Spill +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmul.f16 q4, q2, q7 +; CHECK-NEXT: vneg.f16 q4, q4 +; CHECK-NEXT: vfma.f16 q4, q5, q6 +; CHECK-NEXT: vmul.f16 q5, q5, q7 +; CHECK-NEXT: vfma.f16 q5, q2, q6 +; CHECK-NEXT: vldmia sp, {d4, d5, d6, d7} @ 32-byte Reload +; CHECK-NEXT: vmul.f16 q6, q3, q1 +; CHECK-NEXT: vneg.f16 q6, q6 +; CHECK-NEXT: vmul.f16 q7, q2, q1 +; CHECK-NEXT: vfma.f16 q6, q2, q0 +; CHECK-NEXT: vfma.f16 q7, q3, q0 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: vst20.16 {q6, q7}, [r2] +; CHECK-NEXT: vst21.16 {q6, q7}, [r3]! 
+; CHECK-NEXT: vst20.16 {q4, q5}, [r3] +; CHECK-NEXT: vst21.16 {q4, q5}, [r3] +; CHECK-NEXT: b .LBB4_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <32 x half>, <32 x half>* %a + %b.val = load <32 x half>, <32 x half>* %b + %strided.vec = shufflevector <32 x half> %a.val, <32 x half> poison, <16 x i32> + %strided.vec46 = shufflevector <32 x half> %a.val, <32 x half> poison, <16 x i32> + %strided.vec48 = shufflevector <32 x half> %b.val, <32 x half> poison, <16 x i32> + %strided.vec49 = shufflevector <32 x half> %b.val, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %strided.vec48, %strided.vec + %1 = fmul fast <16 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <16 x half> %0, %1 + %3 = fmul fast <16 x half> %strided.vec49, %strided.vec + %4 = fmul fast <16 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %2, <16 x half> %5, <32 x i32> + store <32 x half> %interleaved.vec, <32 x half>* %c, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-add.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 3 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_add_v2f32(<2 x float>* %a, <2 x float>* %b, <2 x float>* %c) #0 { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldr s0, [r0] +; 
CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vldr s4, [r1] +; CHECK-NEXT: vldr s6, [r1, #4] +; CHECK-NEXT: vsub.f32 s2, s4, s2 +; CHECK-NEXT: vadd.f32 s0, s6, s0 +; CHECK-NEXT: vstr s2, [r2] +; CHECK-NEXT: vstr s0, [r2, #4] +; CHECK-NEXT: b .LBB0_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <2 x float>, <2 x float>* %a, align 4 + %strided.vec = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %strided.vec21 = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %wide.vec22 = load <2 x float>, <2 x float>* %b, align 4 + %strided.vec23 = shufflevector <2 x float> %wide.vec22, <2 x float> zeroinitializer, <1 x i32> + %strided.vec24 = shufflevector <2 x float> %wide.vec22, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %strided.vec23, %strided.vec21 + %1 = fadd fast <1 x float> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + store <2 x float> %interleaved.vec, <2 x float>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v4f32(<4 x float>* %a, <4 x float>* %b, <4 x float>* %c) #0 { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vstrw.32 q2, [r2] +; CHECK-NEXT: b .LBB1_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <4 x float>, <4 x float>* %a, align 4 + %strided.vec = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %strided.vec21 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %wide.vec22 = load <4 x float>, <4 x float>* %b, align 4 + %strided.vec23 = shufflevector <4 x float> %wide.vec22, <4 x float> 
zeroinitializer, <2 x i32> + %strided.vec24 = shufflevector <4 x float> %wide.vec22, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %strided.vec23, %strided.vec21 + %1 = fadd fast <2 x float> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + store <4 x float> %interleaved.vec, <4 x float>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vcadd.f32 q4, q2, q1, #90 +; CHECK-NEXT: vcadd.f32 q1, q3, q0, #90 +; CHECK-NEXT: vstrw.32 q1, [r2, #16] +; CHECK-NEXT: vstrw.32 q4, [r2] +; CHECK-NEXT: b .LBB2_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <8 x float>, <8 x float>* %a, align 4 + %strided.vec = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %strided.vec21 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %wide.vec22 = load <8 x float>, <8 x float>* %b, align 4 + %strided.vec23 = shufflevector <8 x float> %wide.vec22, <8 x float> zeroinitializer, <4 x i32> + %strided.vec24 = shufflevector <8 x float> %wide.vec22, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %strided.vec23, %strided.vec21 + %1 = fadd fast <4 x float> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v16f32(<16 x float>* %a, <16 x float>* %b, <16 
x float>* %c) #0 { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r1, #32] +; CHECK-NEXT: vldrw.u32 q4, [r1, #48] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s1, s6 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s8, s12 +; CHECK-NEXT: vmov.f32 s9, s14 +; CHECK-NEXT: vmov.f32 s12, s13 +; CHECK-NEXT: vmov.f32 s13, s15 +; CHECK-NEXT: vmov.f32 s10, s16 +; CHECK-NEXT: vmov.f32 s11, s18 +; CHECK-NEXT: vmov.f32 s14, s17 +; CHECK-NEXT: vmov.f32 s15, s19 +; CHECK-NEXT: vsub.f32 q1, q2, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vadd.f32 q2, q3, q0 +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: vldrw.u32 q6, [r1, #16] +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s1, s18 +; CHECK-NEXT: vmov.f32 s16, s17 +; CHECK-NEXT: vmov.f32 s17, s19 +; CHECK-NEXT: vmov.f32 s2, s12 +; CHECK-NEXT: vmov.f32 s3, s14 +; CHECK-NEXT: vmov.f32 s18, s13 +; CHECK-NEXT: vmov.f32 s19, s15 +; CHECK-NEXT: vmov.f32 s12, s20 +; CHECK-NEXT: vmov.f32 s13, s22 +; CHECK-NEXT: vmov.f32 s20, s21 +; CHECK-NEXT: vmov.f32 s21, s23 +; CHECK-NEXT: vmov.f32 s14, s24 +; CHECK-NEXT: vmov.f32 s15, s26 +; CHECK-NEXT: vmov.f32 s22, s25 +; CHECK-NEXT: vmov.f32 s23, s27 +; CHECK-NEXT: vsub.f32 q3, q3, q4 +; CHECK-NEXT: vadd.f32 q4, q5, q0 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: vst20.32 {q3, q4}, [r2] +; CHECK-NEXT: vst21.32 {q3, q4}, [r3]! 
+; CHECK-NEXT: vst20.32 {q1, q2}, [r3] +; CHECK-NEXT: vst21.32 {q1, q2}, [r3] +; CHECK-NEXT: b .LBB3_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <16 x float>, <16 x float>* %a, align 4 + %strided.vec = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %strided.vec21 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %wide.vec22 = load <16 x float>, <16 x float>* %b, align 4 + %strided.vec23 = shufflevector <16 x float> %wide.vec22, <16 x float> zeroinitializer, <8 x i32> + %strided.vec24 = shufflevector <16 x float> %wide.vec22, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %strided.vec23, %strided.vec21 + %1 = fadd fast <8 x float> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + store <16 x float> %interleaved.vec, <16 x float>* %c, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f32-mul.ll @@ -0,0 +1,182 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 6 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f32(<2 x float>* %a, <2 x float>* %b, <2 x float>* %c) #0 { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldr s2, [r0, #4] +; CHECK-NEXT: vldr s4, [r1, #4] +; CHECK-NEXT: vldr s6, [r1] +; CHECK-NEXT: 
vldr s0, [r0] +; CHECK-NEXT: vmul.f32 s8, s4, s2 +; CHECK-NEXT: vmul.f32 s2, s6, s2 +; CHECK-NEXT: vfnms.f32 s8, s6, s0 +; CHECK-NEXT: vfma.f32 s2, s4, s0 +; CHECK-NEXT: vstr s8, [r2] +; CHECK-NEXT: vstr s2, [r2, #4] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <2 x float>, <2 x float>* %a + %b.val = load <2 x float>, <2 x float>* %b + %strided.vec = shufflevector <2 x float> %a.val, <2 x float> poison, <1 x i32> + %strided.vec46 = shufflevector <2 x float> %a.val, <2 x float> poison, <1 x i32> + %strided.vec48 = shufflevector <2 x float> %b.val, <2 x float> poison, <1 x i32> + %strided.vec49 = shufflevector <2 x float> %b.val, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %strided.vec48, %strided.vec + %1 = fmul fast <1 x float> %strided.vec49, %strided.vec46 + %2 = fsub fast <1 x float> %0, %1 + %3 = fmul fast <1 x float> %strided.vec49, %strided.vec + %4 = fmul fast <1 x float> %strided.vec48, %strided.vec46 + %5 = fadd fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %2, <1 x float> %5, <2 x i32> + store <2 x float> %interleaved.vec, <2 x float>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v4f32(<4 x float>* %a, <4 x float>* %b, <4 x float>* %c) #0 { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 +; CHECK-NEXT: vstrw.32 q2, [r2] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <4 x float>, <4 x float>* %a + %b.val = load <4 x float>, <4 x float>* %b + %strided.vec = shufflevector <4 x float> %a.val, <4 x float> poison, <2 x i32> + %strided.vec46 = shufflevector <4 x float> %a.val, <4 x float> poison, <2 x i32> + %strided.vec48 = 
shufflevector <4 x float> %b.val, <4 x float> poison, <2 x i32> + %strided.vec49 = shufflevector <4 x float> %b.val, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec48, %strided.vec + %1 = fmul fast <2 x float> %strided.vec49, %strided.vec46 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %strided.vec49, %strided.vec + %4 = fmul fast <2 x float> %strided.vec48, %strided.vec46 + %5 = fadd fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> + store <4 x float> %interleaved.vec, <4 x float>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) #0 { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vcmul.f32 q4, q1, q2, #0 +; CHECK-NEXT: vcmla.f32 q4, q1, q2, #90 +; CHECK-NEXT: vcmul.f32 q1, q0, q3, #0 +; CHECK-NEXT: vcmla.f32 q1, q0, q3, #90 +; CHECK-NEXT: vstrw.32 q1, [r2, #16] +; CHECK-NEXT: vstrw.32 q4, [r2] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <8 x float>, <8 x float>* %a + %b.val = load <8 x float>, <8 x float>* %b + %strided.vec = shufflevector <8 x float> %a.val, <8 x float> poison, <4 x i32> + %strided.vec46 = shufflevector <8 x float> %a.val, <8 x float> poison, <4 x i32> + %strided.vec48 = shufflevector <8 x float> %b.val, <8 x float> poison, <4 x i32> + %strided.vec49 = shufflevector <8 x float> %b.val, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %strided.vec48, %strided.vec + %1 = fmul fast <4 x float> %strided.vec49, %strided.vec46 + %2 = fsub fast <4 x float> %0, %1 + %3 = fmul 
fast <4 x float> %strided.vec49, %strided.vec + %4 = fmul fast <4 x float> %strided.vec48, %strided.vec46 + %5 = fadd fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %2, <4 x float> %5, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x float>* %c) #0 { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r4, r5} +; CHECK-NEXT: push {r4, r5} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #32 +; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vld20.32 {q2, q3}, [r1] +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: vld20.32 {q0, q1}, [r0] +; CHECK-NEXT: mov r12, r0 +; CHECK-NEXT: vld21.32 {q2, q3}, [r3]! +; CHECK-NEXT: vld21.32 {q0, q1}, [r12]! +; CHECK-NEXT: vld20.32 {q4, q5}, [r3] +; CHECK-NEXT: vld20.32 {q6, q7}, [r12] +; CHECK-NEXT: vld21.32 {q4, q5}, [r3] +; CHECK-NEXT: vld21.32 {q6, q7}, [r12] +; CHECK-NEXT: vstmia sp, {d4, d5, d6, d7} @ 32-byte Spill +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmul.f32 q4, q2, q7 +; CHECK-NEXT: vneg.f32 q4, q4 +; CHECK-NEXT: vfma.f32 q4, q5, q6 +; CHECK-NEXT: vmul.f32 q5, q5, q7 +; CHECK-NEXT: vfma.f32 q5, q2, q6 +; CHECK-NEXT: vldmia sp, {d4, d5, d6, d7} @ 32-byte Reload +; CHECK-NEXT: vmul.f32 q6, q3, q1 +; CHECK-NEXT: vneg.f32 q6, q6 +; CHECK-NEXT: vmul.f32 q7, q2, q1 +; CHECK-NEXT: vfma.f32 q6, q2, q0 +; CHECK-NEXT: vfma.f32 q7, q3, q0 +; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: vst20.32 {q6, q7}, [r2] +; CHECK-NEXT: vst21.32 {q6, q7}, [r3]! 
+; CHECK-NEXT: vst20.32 {q4, q5}, [r3] +; CHECK-NEXT: vst21.32 {q4, q5}, [r3] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <16 x float>, <16 x float>* %a + %b.val = load <16 x float>, <16 x float>* %b + %strided.vec = shufflevector <16 x float> %a.val, <16 x float> poison, <8 x i32> + %strided.vec46 = shufflevector <16 x float> %a.val, <16 x float> poison, <8 x i32> + %strided.vec48 = shufflevector <16 x float> %b.val, <16 x float> poison, <8 x i32> + %strided.vec49 = shufflevector <16 x float> %b.val, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %strided.vec48, %strided.vec + %1 = fmul fast <8 x float> %strided.vec49, %strided.vec46 + %2 = fsub fast <8 x float> %0, %1 + %3 = fmul fast <8 x float> %strided.vec49, %strided.vec + %4 = fmul fast <8 x float> %strided.vec48, %strided.vec46 + %5 = fadd fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %2, <8 x float> %5, <16 x i32> + store <16 x float> %interleaved.vec, <16 x float>* %c, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-add.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS-NOT: "complex-arithmetic.NumComplexIntrinsics" + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_add_v2f64(<2 x double>* %a, <2 x double>* %b, <2 x double>* %c) #0 { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, 
[r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vsub.f64 d2, d2, d1 +; CHECK-NEXT: vadd.f64 d3, d3, d0 +; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: b .LBB0_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <2 x double>, <2 x double>* %a, align 4 + %strided.vec = shufflevector <2 x double> %wide.vec, <2 x double> zeroinitializer, <1 x i32> + %strided.vec21 = shufflevector <2 x double> %wide.vec, <2 x double> zeroinitializer, <1 x i32> + %wide.vec22 = load <2 x double>, <2 x double>* %b, align 4 + %strided.vec23 = shufflevector <2 x double> %wide.vec22, <2 x double> zeroinitializer, <1 x i32> + %strided.vec24 = shufflevector <2 x double> %wide.vec22, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %strided.vec23, %strided.vec21 + %1 = fadd fast <1 x double> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + store <2 x double> %interleaved.vec, <2 x double>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vsub.f64 d4, d4, d1 +; CHECK-NEXT: vadd.f64 d5, d5, d0 +; CHECK-NEXT: vsub.f64 d0, d6, d3 +; CHECK-NEXT: vadd.f64 d1, d7, d2 +; CHECK-NEXT: vstrw.32 q0, [r2, #16] +; CHECK-NEXT: vstrw.32 q2, [r2] +; CHECK-NEXT: b .LBB1_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <4 x double>, <4 x double>* %a, align 4 + %strided.vec = shufflevector <4 x double> %wide.vec, <4 x double> zeroinitializer, <2 x i32> + %strided.vec21 = shufflevector <4 x double> %wide.vec, <4 x 
double> zeroinitializer, <2 x i32> + %wide.vec22 = load <4 x double>, <4 x double>* %b, align 4 + %strided.vec23 = shufflevector <4 x double> %wide.vec22, <4 x double> zeroinitializer, <2 x i32> + %strided.vec24 = shufflevector <4 x double> %wide.vec22, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x double> %strided.vec23, %strided.vec21 + %1 = fadd fast <2 x double> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + store <4 x double> %interleaved.vec, <4 x double>* %c, align 4 + br label %vector.body +} + +define void @complex_add_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1, #16] +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vsub.f64 d4, d4, d3 +; CHECK-NEXT: vadd.f64 d5, d5, d2 +; CHECK-NEXT: vsub.f64 d2, d6, d1 +; CHECK-NEXT: vadd.f64 d3, d7, d0 +; CHECK-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q4, [r1, #48] +; CHECK-NEXT: vldrw.u32 q5, [r1, #32] +; CHECK-NEXT: vsub.f64 d8, d8, d1 +; CHECK-NEXT: vadd.f64 d9, d9, d0 +; CHECK-NEXT: vsub.f64 d0, d10, d7 +; CHECK-NEXT: vadd.f64 d1, d11, d6 +; CHECK-NEXT: vstrw.32 q0, [r2, #32] +; CHECK-NEXT: vstrw.32 q4, [r2, #48] +; CHECK-NEXT: vstrw.32 q1, [r2] +; CHECK-NEXT: vstrw.32 q2, [r2, #16] +; CHECK-NEXT: b .LBB2_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %wide.vec = load <8 x double>, <8 x double>* %a, align 4 + %strided.vec = shufflevector <8 x double> %wide.vec, <8 x double> zeroinitializer, <4 x i32> + %strided.vec21 = shufflevector <8 x double> %wide.vec, <8 x double> 
zeroinitializer, <4 x i32> + %wide.vec22 = load <8 x double>, <8 x double>* %b, align 4 + %strided.vec23 = shufflevector <8 x double> %wide.vec22, <8 x double> zeroinitializer, <4 x i32> + %strided.vec24 = shufflevector <8 x double> %wide.vec22, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %strided.vec23, %strided.vec21 + %1 = fadd fast <4 x double> %strided.vec24, %strided.vec + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + store <8 x double> %interleaved.vec, <8 x double>* %c, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-f64-mul.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS-NOT: "complex-arithmetic.NumComplexIntrinsics" + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f64(<2 x double>* %a, <2 x double>* %b, <2 x double>* %c) #0 { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmul.f64 d4, d3, d1 +; CHECK-NEXT: vmul.f64 d5, d2, d1 +; CHECK-NEXT: vfnms.f64 d4, d2, d0 +; CHECK-NEXT: vfma.f64 d5, d3, d0 +; CHECK-NEXT: vstrw.32 q2, [r2] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <2 x double>, <2 x double>* %a + %b.val = load <2 x double>, <2 x double>* %b + %strided.vec = shufflevector <2 x double> %a.val, <2 x double> poison, <1 x i32> + %strided.vec46 
= shufflevector <2 x double> %a.val, <2 x double> poison, <1 x i32> + %strided.vec48 = shufflevector <2 x double> %b.val, <2 x double> poison, <1 x i32> + %strided.vec49 = shufflevector <2 x double> %b.val, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %strided.vec48, %strided.vec + %1 = fmul fast <1 x double> %strided.vec49, %strided.vec46 + %2 = fsub fast <1 x double> %0, %1 + %3 = fmul fast <1 x double> %strided.vec49, %strided.vec + %4 = fmul fast <1 x double> %strided.vec48, %strided.vec46 + %5 = fadd fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %2, <1 x double> %5, <2 x i32> + store <2 x double> %interleaved.vec, <2 x double>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x double>* %c) #0 { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q1, [r1, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vmul.f64 d4, d3, d1 +; CHECK-NEXT: vmul.f64 d5, d2, d1 +; CHECK-NEXT: vfnms.f64 d4, d2, d0 +; CHECK-NEXT: vfma.f64 d5, d3, d0 +; CHECK-NEXT: vmul.f64 d0, d9, d7 +; CHECK-NEXT: vmul.f64 d1, d8, d7 +; CHECK-NEXT: vfnms.f64 d0, d8, d6 +; CHECK-NEXT: vfma.f64 d1, d9, d6 +; CHECK-NEXT: vstrw.32 q2, [r2, #16] +; CHECK-NEXT: vstrw.32 q0, [r2] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: + %a.val = load <4 x double>, <4 x double>* %a + %b.val = load <4 x double>, <4 x double>* %b + %strided.vec = shufflevector <4 x double> %a.val, <4 x double> poison, <2 x i32> + %strided.vec46 = shufflevector <4 x double> %a.val, <4 x double> poison, <2 x i32> + %strided.vec48 = shufflevector <4 x double> %b.val, <4 x double> poison, <2 x i32> + %strided.vec49 
= shufflevector <4 x double> %b.val, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %strided.vec48, %strided.vec + %1 = fmul fast <2 x double> %strided.vec49, %strided.vec46 + %2 = fsub fast <2 x double> %0, %1 + %3 = fmul fast <2 x double> %strided.vec49, %strided.vec + %4 = fmul fast <2 x double> %strided.vec48, %strided.vec46 + %5 = fadd fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> + store <4 x double> %interleaved.vec, <4 x double>* %c, align 4 + br label %vector.body +} + +define void @complex_mul_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x double>* %c) #0 { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q2, [r1, #16] +; CHECK-NEXT: vldrw.u32 q3, [r0, #48] +; CHECK-NEXT: vldrw.u32 q4, [r1, #48] +; CHECK-NEXT: vmul.f64 d2, d5, d1 +; CHECK-NEXT: vmul.f64 d3, d4, d1 +; CHECK-NEXT: vfnms.f64 d2, d4, d0 +; CHECK-NEXT: vfma.f64 d3, d5, d0 +; CHECK-NEXT: vmul.f64 d0, d9, d7 +; CHECK-NEXT: vmul.f64 d1, d8, d7 +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q5, [r1, #32] +; CHECK-NEXT: vfnms.f64 d0, d8, d6 +; CHECK-NEXT: vfma.f64 d1, d9, d6 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vmul.f64 d6, d11, d5 +; CHECK-NEXT: vmul.f64 d7, d10, d5 +; CHECK-NEXT: vfnms.f64 d6, d10, d4 +; CHECK-NEXT: vfma.f64 d7, d11, d4 +; CHECK-NEXT: vmul.f64 d4, d13, d9 +; CHECK-NEXT: vmul.f64 d5, d12, d9 +; CHECK-NEXT: vfnms.f64 d4, d12, d8 +; CHECK-NEXT: vfma.f64 d5, d13, d8 +; CHECK-NEXT: vstrw.32 q3, [r2, #32] +; CHECK-NEXT: vstrw.32 q0, [r2, #48] +; CHECK-NEXT: vstrw.32 q2, [r2] +; CHECK-NEXT: vstrw.32 q1, [r2, #16] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label 
%vector.body + +vector.body: + %a.val = load <8 x double>, <8 x double>* %a + %b.val = load <8 x double>, <8 x double>* %b + %strided.vec = shufflevector <8 x double> %a.val, <8 x double> poison, <4 x i32> + %strided.vec46 = shufflevector <8 x double> %a.val, <8 x double> poison, <4 x i32> + %strided.vec48 = shufflevector <8 x double> %b.val, <8 x double> poison, <4 x i32> + %strided.vec49 = shufflevector <8 x double> %b.val, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %strided.vec48, %strided.vec + %1 = fmul fast <4 x double> %strided.vec49, %strided.vec46 + %2 = fsub fast <4 x double> %0, %1 + %3 = fmul fast <4 x double> %strided.vec49, %strided.vec + %4 = fmul fast <4 x double> %strided.vec48, %strided.vec46 + %5 = fadd fast <4 x double> %3, %4 + %interleaved.vec = shufflevector <4 x double> %2, <4 x double> %5, <8 x i32> + store <8 x double> %interleaved.vec, <8 x double>* %c, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-rotations-add.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-rotations-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-rotations-add.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_rotation_I() #0 { +; CHECK-LABEL: complex_rotation_I: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcadd.f32 q2, q0, q0, #90 +; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label 
%vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %store.ptr = bitcast { float, float }* null to <8 x float>* + %load.vec1 = load <8 x float>, <8 x float>* null, align 4 + %load.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %strided.vec41, %strided.vec39 + %1 = fadd fast <4 x float> %strided.vec42, %strided.vec + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4 + br label %vector.body +} + +define void @complex_rotation_III() #0 { +; CHECK-LABEL: complex_rotation_III: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcadd.f32 q2, q0, q0, #270 +; CHECK-NEXT: vcadd.f32 q0, q1, q1, #270 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %store.ptr = bitcast { float, float }* null to <8 x float>* + %load.vec1 = load <8 x float>, <8 x float>* null, align 4 + %load.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x 
i32> + %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %0 = fadd fast <4 x float> %strided.vec42, %strided.vec + %1 = fsub fast <4 x float> %strided.vec39, %strided.vec41 + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-rotations-mul.ll b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-rotations-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/ComplexArithmetic/complex-arithmetic-rotations-mul.ll @@ -0,0 +1,183 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_rotation() #0 { +; CHECK-LABEL: complex_rotation: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmul.f32 q2, q0, q0, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q0, #90 +; CHECK-NEXT: vcmul.f32 q0, q1, q1, #0 +; CHECK-NEXT: vcmla.f32 q0, q1, q1, #90 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %store.ptr = bitcast { float, float }* null to <8 x float>* + %load.vec1 = load <8 x float>, <8 x float>* null, align 4 + %load.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec41 = shufflevector <8 x float> 
%load.vec2, <8 x float> zeroinitializer, <4 x i32> + %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %0 = fmul fast <4 x float> %strided.vec41, %strided.vec + %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39 + %2 = fsub fast <4 x float> %0, %1 + %3 = fmul fast <4 x float> %strided.vec42, %strided.vec + %4 = fmul fast <4 x float> %strided.vec41, %strided.vec39 + %5 = fadd fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %2, <4 x float> %5, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4 + br label %vector.body +} + +define void @complex_rotation_I() #0 { +; CHECK-LABEL: complex_rotation_I: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r1, #16 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s1, s6 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vmul.f32 q2, q1, q1 +; CHECK-NEXT: vneg.f32 q3, q2 +; CHECK-NEXT: vmul.f32 q2, q0, q1 +; CHECK-NEXT: vneg.f32 q2, q2 +; CHECK-NEXT: vfma.f32 q3, q0, q0 +; CHECK-NEXT: vfms.f32 q2, q0, q1 +; CHECK-NEXT: vst20.32 {q2, q3}, [r0] +; CHECK-NEXT: vst21.32 {q2, q3}, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %store.ptr = bitcast { float, float }* null to <8 x float>* + %load.vec1 = load <8 x float>, <8 x float>* null, align 4 + %load.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + 
%strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %0 = fmul fast <4 x float> %strided.vec41, %strided.vec + %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39 + %2 = fsub fast <4 x float> %0, %1 + %3 = fneg fast <4 x float> %strided.vec + %4 = fmul fast <4 x float> %strided.vec42, %3 + %5 = fmul fast <4 x float> %strided.vec41, %strided.vec39 + %6 = fsub fast <4 x float> %4, %5 + %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %2, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4 + br label %vector.body +} + + +define void @complex_rotation_II() #0 { +; CHECK-LABEL: complex_rotation_II: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: movs r0, #16 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vcmul.f32 q2, q0, q0, #180 +; CHECK-NEXT: vcmla.f32 q2, q0, q0, #270 +; CHECK-NEXT: vcmul.f32 q0, q1, q1, #180 +; CHECK-NEXT: vcmla.f32 q0, q1, q1, #270 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q2, [r1] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %store.ptr = bitcast { float, float }* null to <8 x float>* + %load.vec1 = load <8 x float>, <8 x float>* null, align 4 + %load.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39.neg = fneg 
fast <4 x float> %strided.vec39 + %0 = fmul fast <4 x float> %strided.vec41, %strided.vec39.neg + %1 = fmul fast <4 x float> %strided.vec42, %strided.vec + %2 = fsub fast <4 x float> %0, %1 + %3 = fmul fast <4 x float> %strided.vec42, %strided.vec39 + %4 = fmul fast <4 x float> %strided.vec41, %strided.vec + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4 + br label %vector.body +} + +define void @complex_rotation_III() #0 { +; CHECK-LABEL: complex_rotation_III: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: movs r1, #16 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s1, s6 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmov.f32 s6, s9 +; CHECK-NEXT: vmov.f32 s7, s11 +; CHECK-NEXT: vmov.f32 s2, s8 +; CHECK-NEXT: vmov.f32 s3, s10 +; CHECK-NEXT: vmul.f32 q2, q1, q1 +; CHECK-NEXT: vneg.f32 q3, q2 +; CHECK-NEXT: vmul.f32 q2, q0, q1 +; CHECK-NEXT: vfma.f32 q3, q0, q0 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vst20.32 {q2, q3}, [r0] +; CHECK-NEXT: vst21.32 {q2, q3}, [r0] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %store.ptr = bitcast { float, float }* null to <8 x float>* + %load.vec1 = load <8 x float>, <8 x float>* null, align 4 + %load.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> + %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> + %strided.vec42 = shufflevector <8 x float> 
%load.vec2, <8 x float> zeroinitializer, <4 x i32> + %0 = fmul fast <4 x float> %strided.vec41, %strided.vec + %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39 + %2 = fsub fast <4 x float> %0, %1 + %3 = fmul fast <4 x float> %strided.vec42, %strided.vec + %4 = fmul fast <4 x float> %strided.vec41, %strided.vec39 + %5 = fadd fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4 + br label %vector.body +} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -46,6 +46,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare