diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -43,6 +43,7 @@ class DominatorTree; class BranchInst; class CallBase; +class ComplexArithmeticGraph; class Function; class GlobalValue; class InstCombiner; @@ -762,6 +763,13 @@ /// the scalarization cost of a load/store. bool supportsEfficientVectorElementLoadStore() const; + bool supportsComplexArithmetic() const; + + Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, Value *InputA, + Value *InputB, int &GeneratedIntrinsicCount); + + bool matchComplexArithmeticIR(Instruction *I, ComplexArithmeticGraph &G); + /// Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -1582,6 +1590,12 @@ getOperandsScalarizationOverhead(ArrayRef Args, ArrayRef Tys) = 0; virtual bool supportsEfficientVectorElementLoadStore() = 0; + virtual bool supportsComplexArithmetic() const = 0; + virtual Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, + Value *InputA, Value *InputB, + int &GeneratedIntrinsicCount) = 0; + virtual bool matchComplexArithmeticIR(Instruction *I, + ComplexArithmeticGraph &G) = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; @@ -2030,6 +2044,22 @@ return Impl.supportsEfficientVectorElementLoadStore(); } + bool supportsComplexArithmetic() const override { + return Impl.supportsComplexArithmetic(); + } + + Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, Value *InputA, + Value *InputB, + int &GeneratedIntrinsicCount) override { + return Impl.createComplexArithmeticIR(G, InputA, InputB, + GeneratedIntrinsicCount); + } + + bool matchComplexArithmeticIR(Instruction *I, + ComplexArithmeticGraph &G) override { + return Impl.matchComplexArithmeticIR(I, G); + } + bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -334,6 +334,16 @@ bool supportsEfficientVectorElementLoadStore() const { return false; } + bool supportsComplexArithmetic() const { return false; } + Value *createComplexArithmeticIR(ComplexArithmeticGraph &, Value *, Value *, + int &GeneratedIntrinsicCount) { + return nullptr; + } + + bool matchComplexArithmeticIR(Instruction *I, ComplexArithmeticGraph &G) { + return false; + } + bool enableAggressiveInterleaving(bool LoopHasReductions) const { return false; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -114,9 +114,10 @@ void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); -void initializeCodeGenPreparePass(PassRegistry&); -void initializeConstantHoistingLegacyPassPass(PassRegistry&); -void initializeConstantMergeLegacyPassPass(PassRegistry&); +void initializeCodeGenPreparePass(PassRegistry &); +void initializeComplexArithmeticLegacyPassPass(PassRegistry &); +void 
initializeConstantHoistingLegacyPassPass(PassRegistry &); +void initializeConstantMergeLegacyPassPass(PassRegistry &); void initializeConstraintEliminationPass(PassRegistry &); void initializeControlHeightReductionLegacyPassPass(PassRegistry&); void initializeCorrelatedValuePropagationPass(PassRegistry&); diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -555,13 +555,20 @@ // FunctionPass *createInstSimplifyLegacyPass(); - //===----------------------------------------------------------------------===// // // createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather // and scatter intrinsics with scalar code when target doesn't support them. // FunctionPass *createScalarizeMaskedMemIntrinLegacyPass(); -} // End llvm namespace + +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic +// +FunctionPass *createComplexArithmeticPass(); + +} // namespace llvm #endif diff --git a/llvm/include/llvm/Transforms/Scalar/ComplexArithmetic.h b/llvm/include/llvm/Transforms/Scalar/ComplexArithmetic.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/ComplexArithmetic.h @@ -0,0 +1,252 @@ +//===- ComplexArithmetic.h - Complex Arithmetic Pass --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H +#define LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H + +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" + +namespace llvm { + +class Function; + +struct ComplexArithmeticPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/** + * Annotated graph-like structure that enriches the existing Instruction graph, + * allowing for contextual clues relevant to complex arithmetic to be provided + * and given to TTI hooks as required. + */ +class ComplexArithmeticGraph { +public: + /** + * Bitflags denoting the type of the instruction node. + */ + enum NodeType { + Unknown = 0, + // Actual node types + + Real = 1, + Imaginary = 2, + Load = 4, + Store = 8, + Shuffle = 16, + AddOperand = 32, + + // Meta node types, defining additional behaviour upon node creation + + /** + * Will cause the node to look at parents to try and identify the type. + * Parents must already be registered and identified. + */ + Discover = 0xffff, + }; + + enum GraphType { Complex_Mul, Complex_Add }; + + struct Node { + public: + Instruction *I; + NodeType NType; + + Node(Instruction *i, enum NodeType nodeType) : I(i), NType(nodeType) {} + }; + + /** + * Returns a copy of the vector of all registered nodes. + */ + SmallVector getAllNodes() { + SmallVector Is; + for (auto &N : Nodes) + Is.push_back(N->I); + return Is; + } + + /** + * Returns a vector of all registered nodes that are of the given type. 
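+   *
+   * For illustration, a query matching this pass's own usage in
+   * substituteGraph (flags combine bitwise, and a node must carry every
+   * requested bit to match):
+   *
+   *   auto RealShuffles = G.getNodesOfType(
+   *       ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);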
+   */
+  SmallVector<Instruction *, 8> getNodesOfType(NodeType Type) {
+    SmallVector<Instruction *, 8> Is;
+    for (auto &N : Nodes) {
+      if ((N->NType & Type) == Type)
+        Is.push_back(N->I);
+    }
+    return Is;
+  }
+
+  /**
+   * Returns the node type of I. It must already be registered and identified,
+   * otherwise `Unknown` is returned.
+   */
+  NodeType getNodeType(Instruction *I) {
+    auto *N = getNode(I);
+    if (N == nullptr)
+      return Unknown;
+    return N->NType;
+  }
+
+  /**
+   * Registers and identifies the given Instruction, optionally with the
+   * provided NodeType.
+   */
+  void addNode(Instruction *I, NodeType NType = Unknown) {
+    if ((NType & Discover) == Discover) {
+      auto LeftType = getNodeType(cast<Instruction>(I->getOperand(0)));
+      auto RightType = getNodeType(cast<Instruction>(I->getOperand(1)));
+
+      if (LeftType == Unknown || RightType == Unknown) {
+        NType = Unknown;
+      } else if (I->getOpcode() == Instruction::FMul) {
+        // Real*real and imaginary*imaginary products feed the real part;
+        // mixed products feed the imaginary part.
+        NType = LeftType == RightType ? Real : Imaginary;
+      } else {
+        NType = LeftType;
+      }
+    }
+
+    auto *Existing = getNode(I);
+    if (Existing != nullptr) {
+      if (Existing->NType == NType)
+        return;
+      llvm_unreachable(
+          "A node has been added twice, with conflicting node types.");
+    }
+
+    Nodes.push_back(std::make_unique<Node>(I, NType));
+  }
+
+  LLVMContext &getContext() { return CurrentI->getContext(); }
+
+  Instruction *getCurrentInstruction() { return CurrentI; }
+
+  void setCurrentInstruction(Instruction *I) { CurrentI = I; }
+
+  void setType(GraphType Type) { GType = Type; }
+
+  GraphType getType() { return GType; }
+
+  void setRotation(unsigned R) { Rotation = R; }
+
+  unsigned getRotation() { return Rotation; }
+
+  /**
+   * Sets the graph userdata pointer. The graph then assumes ownership of the
+   * pointer, and will free it on destruction.
+   */
+  template <typename T> void setUserData(T *Ptr) {
+    UserData = std::shared_ptr<T>(Ptr);
+  }
+
+  /**
+   * Gets the graph userdata pointer, casting it to T.
+   *
+   * Note: No checks are made by the graph to ensure the type of the data is
+   * as requested. It is up to the caller to check for that.
+   */
+  template <typename T> T *getUserData() {
+    if (UserData == nullptr)
+      return nullptr;
+    return static_cast<T *>(UserData.get());
+  }
+
+private:
+  unsigned Rotation = 0;
+  GraphType GType;
+  Instruction *CurrentI = nullptr;
+  // std::unique_ptr doesn't support void* without an explicit deleter
+  std::shared_ptr<void> UserData;
+
+  Node *getNode(Instruction *I) {
+    for (const auto &Item : Nodes) {
+      if (Item->I == I)
+        return Item.get();
+    }
+    return nullptr;
+  }
+
+  SmallVector<std::unique_ptr<Node>, 8> Nodes;
+};
+
+inline ComplexArithmeticGraph::NodeType
+operator~(ComplexArithmeticGraph::NodeType a) {
+  return (ComplexArithmeticGraph::NodeType) ~(int)a;
+}
+inline ComplexArithmeticGraph::NodeType
+operator|(ComplexArithmeticGraph::NodeType a,
+          ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType)((int)a | (int)b);
+}
+inline ComplexArithmeticGraph::NodeType
+operator&(ComplexArithmeticGraph::NodeType a,
+          ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType)((int)a & (int)b);
+}
+inline ComplexArithmeticGraph::NodeType
+operator^(ComplexArithmeticGraph::NodeType a,
+          ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType)((int)a ^ (int)b);
+}
+inline ComplexArithmeticGraph::NodeType &
+operator|=(ComplexArithmeticGraph::NodeType &a,
+           ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType &)((int &)a |= (int)b);
+}
+inline ComplexArithmeticGraph::NodeType &
+operator&=(ComplexArithmeticGraph::NodeType &a,
+           ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType &)((int &)a &= (int)b);
+}
+inline ComplexArithmeticGraph::NodeType &
+operator^=(ComplexArithmeticGraph::NodeType &a,
+           ComplexArithmeticGraph::NodeType b) {
+  return (ComplexArithmeticGraph::NodeType &)((int &)a ^= (int)b);
+}
+
+/**
+ * Creates a contiguous mask of the given length, optionally with a base
+ * offset (so length 4 with offset 8 gives <8, 9, 10, 11>). Returning owned
+ * storage avoids leaking a heap array behind a raw ArrayRef.
+ */
+inline SmallVector<int, 16> createContiguousMask(int Len, int Offset = 0) {
+  SmallVector<int, 16> Mask;
+  for (int i = 0; i < Len; ++i)
+    Mask.push_back(i + Offset);
+  return Mask;
+}
+
+/**
+ * Creates an interleaving mask of the given length.
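+ *
+ * For example, a length of 8 yields <0, 4, 1, 5, 2, 6, 3, 7>: lane i of the
+ * first half is paired with lane i of the second half.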
+ */
+inline SmallVector<int, 16> createInterleavingMask(int Len) {
+  int Step = Len / 2;
+  SmallVector<int, 16> Mask;
+  int Idx = 0;
+  for (int i = 0; i < Len; i += 2) {
+    Mask.push_back(Idx);
+    Mask.push_back(Idx + Step);
+    ++Idx;
+  }
+  return Mask;
+}
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_COMPLEXARITHMETIC_H
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -507,6 +507,23 @@
   return TTIImpl->supportsEfficientVectorElementLoadStore();
 }
 
+bool TargetTransformInfo::supportsComplexArithmetic() const {
+  return TTIImpl->supportsComplexArithmetic();
+}
+
+Value *
+TargetTransformInfo::createComplexArithmeticIR(ComplexArithmeticGraph &G,
+                                               Value *InputA, Value *InputB,
+                                               int &GeneratedIntrinsicCount) {
+  return TTIImpl->createComplexArithmeticIR(G, InputA, InputB,
+                                            GeneratedIntrinsicCount);
+}
+
+bool TargetTransformInfo::matchComplexArithmeticIR(Instruction *I,
+                                                   ComplexArithmeticGraph &G) {
+  return TTIImpl->matchComplexArithmeticIR(I, G);
+}
+
 bool TargetTransformInfo::enableAggressiveInterleaving(
     bool LoopHasReductions) const {
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -145,6 +145,7 @@
 #include "llvm/Transforms/Scalar/AnnotationRemarks.h"
 #include "llvm/Transforms/Scalar/BDCE.h"
 #include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/Transforms/Scalar/ComplexArithmetic.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/ConstraintElimination.h"
 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -241,6 +241,7 @@
 FUNCTION_PASS("bounds-checking", BoundsCheckingPass())
 FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
 FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
+FUNCTION_PASS("complex-arithmetic", ComplexArithmeticPass())
 FUNCTION_PASS("consthoist", ConstantHoistingPass())
 FUNCTION_PASS("constraint-elimination", ConstraintEliminationPass())
 FUNCTION_PASS("chr", ControlHeightReductionPass())
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -423,11 +423,15 @@
   // Run the parallel DSP pass.
   if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createARMParallelDSPPass());
 
+  // Match complex arithmetic patterns.
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(createComplexArithmeticPass());
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());
 
   // Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows()) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -126,6 +126,11 @@ std::function SimplifyAndSetOp) const; + bool supportsComplexArithmetic() const; + Value *createComplexArithmeticIR(ComplexArithmeticGraph &G, Value *InputA, + Value *InputB, int &GeneratedIntrinsicCount); + bool matchComplexArithmeticIR(Instruction *I, ComplexArithmeticGraph &G); + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -20,8 +20,8 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" @@ -31,6 +31,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" +#include "llvm/Transforms/Scalar/ComplexArithmetic.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include @@ -2344,3 +2345,169 @@ return false; return true; } + +bool ARMTTIImpl::supportsComplexArithmetic() const { + return ST->hasMVEFloatOps(); +} + +namespace { +struct ARMComplexArithmeticMetadata { + bool Halving; +}; +} // namespace + +Value *ARMTTIImpl::createComplexArithmeticIR(ComplexArithmeticGraph &G, + Value *InputA, Value *InputB, + int &GeneratedIntrinsicCount) { + auto *Ty = InputA->getType(); + if (!isa(Ty)) + return nullptr; + auto *VTy = cast(Ty); + + // Cannot widen complex intrinsics to fill vectors + if (VTy->getNumElements() * VTy->getScalarSizeInBits() != 128) + return nullptr; + + // MVE does not support double complex operations + if (VTy->getScalarType()->isDoubleTy()) + return nullptr; + + if (G.getType() == ComplexArithmeticGraph::Complex_Mul) { + + IRBuilder<> B(G.getCurrentInstruction()); + auto *IntTy = Type::getInt32Ty(G.getCurrentInstruction()->getContext()); + int RotIdx = G.getRotation() / 90; + + auto *ConstMulRot = ConstantInt::get(IntTy, RotIdx); + auto *ConstMlaRot = ConstantInt::get(IntTy, (RotIdx + 1) % 4); + auto *Mul = B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstMulRot, InputA, InputB}); + auto *Mla = B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstMlaRot, Mul, InputA, InputB}); + GeneratedIntrinsicCount = 2; + return Mla; + } + + if (G.getType() == ComplexArithmeticGraph::Complex_Add) { + IRBuilder<> B(G.getCurrentInstruction()); + + auto *IntTy = Type::getInt32Ty(G.getContext()); + unsigned HalvingVal = 1; + + auto *Meta = G.getUserData(); + if (Meta && Meta->Halving) + HalvingVal = 0; + + auto *Halving = ConstantInt::get(IntTy, HalvingVal); + + unsigned Rotation = G.getRotation(); + unsigned RotKey; + if (Rotation == 90) + RotKey = 0; + else if (G.getRotation() == 270) + RotKey = 1; + else + return nullptr; // Invalid rotation for arm_mve_vcaddq + + auto *RotVal = ConstantInt::get(IntTy, RotKey); + GeneratedIntrinsicCount = 1; + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {Halving, RotVal, InputA, InputB}); + } + + return nullptr; +} + +static bool matchComplexArithmeticHalvingAdd(Instruction *I, + 
ComplexArithmeticGraph &G) { + auto *SVI = dyn_cast(I); + if (!SVI) + return false; + + if (!SVI->getType()->getElementType()->isIntegerTy(32)) + return false; + + Value *ShuffleAR = nullptr; + Value *ShuffleAI = nullptr; + Value *ShuffleBR = nullptr; + Value *ShuffleBI = nullptr; + + auto *Op0 = cast(SVI->getOperand(0)); + auto *Op1 = cast(SVI->getOperand(1)); + + Op0 = cast(Op0->getOperand(0)); + Op1 = cast(Op1->getOperand(0)); + + unsigned Rotation; + if (Op0->getOpcode() == Instruction::FSub && + Op1->getOpcode() == Instruction::FAdd) { + Rotation = 90; + } else if (Op0->getOpcode() == Instruction::FAdd && + Op1->getOpcode() == Instruction::FSub) { + Rotation = 270; + } else { + return false; + } + + auto ShuffleMask = createInterleavingMask(SVI->getShuffleMask().size()); + + auto *FloatTy = Type::getFloatTy(G.getContext()); + + if (Rotation == 90) { + + auto FSubPattern = m_FSub(m_Value(ShuffleBR), m_Value(ShuffleAI)); + auto FAddPattern = m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)); + + auto *FP = ConstantFP::get(FixedVectorType::get(FloatTy, 4), 0.5f); + auto Mul0Pattern = m_FMul(FSubPattern, m_SpecificFP(0.5f)); + auto Mul1Pattern = m_FMul(FAddPattern, m_SpecificFP(0.5f)); + + if (!match(SVI, m_Shuffle(Mul0Pattern, Mul1Pattern, + m_SpecificMask(ShuffleMask)))) { + dbgs() + << "SVI does not match expected pattern for complex halving add rot " + << Rotation << ".\n"; + return false; + } + } else if (Rotation == 270) { + if (!match(SVI, m_Shuffle(m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)), + m_FSub(m_Value(ShuffleAI), m_Value(ShuffleBR)), + m_SpecificMask(ShuffleMask)))) { + dbgs() + << "SVI does not match expected pattern for complex halving add rot " + << Rotation << ".\n"; + return false; + } + } + + G.addNode(cast(ShuffleAR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(ShuffleAI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + G.addNode(cast(ShuffleBR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(ShuffleBI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + + G.addNode(Op0, ComplexArithmeticGraph::AddOperand); + G.addNode(Op1, ComplexArithmeticGraph::AddOperand); + + G.setType(ComplexArithmeticGraph::Complex_Add); + G.setRotation(Rotation); + auto *Meta = new ARMComplexArithmeticMetadata; + Meta->Halving = true; + G.setUserData(Meta); + + return true; +} + +bool ARMTTIImpl::matchComplexArithmeticIR(Instruction *I, + ComplexArithmeticGraph &G) { + // if (I->getType()->isIntOrIntVectorTy(32)) { + // if (matchComplexArithmeticHalvingAdd(I, G)) + // return true; + // } + return false; +} diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -4,6 +4,7 @@ AnnotationRemarks.cpp BDCE.cpp CallSiteSplitting.cpp + ComplexArithmetic.cpp ConstantHoisting.cpp ConstraintElimination.cpp CorrelatedValuePropagation.cpp diff --git a/llvm/lib/Transforms/Scalar/ComplexArithmetic.cpp b/llvm/lib/Transforms/Scalar/ComplexArithmetic.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/ComplexArithmetic.cpp @@ -0,0 +1,486 @@ +//===- ComplexArithmeticPass.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ComplexArithmetic.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "complex-arithmetic"
+
+STATISTIC(NumComplexIntrinsics, "Number of complex intrinsics generated");
+
+static cl::opt<bool> ComplexArithmeticEnabled(
+    "enable-complex-arithmetic",
+    cl::desc("Enable generation of complex arithmetic instructions"),
+    cl::init(true), cl::Hidden);
+
+namespace {
+
+class ComplexArithmeticLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  ComplexArithmeticLegacyPass() : FunctionPass(ID) {
+    initializeComplexArithmeticLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "Complex Arithmetic Pass"; }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+};
+
+class ComplexArithmetic {
+public:
+  ComplexArithmetic(TargetTransformInfo *tti) : TTI(tti) {}
+  bool runOnFunction(Function &F);
+
+private:
+  bool evaluateComplexArithmeticBasicBlock(
+      BasicBlock *B, SmallVectorImpl<Instruction *> &DeadInsts);
+
+  void cleanupDeadInsts(SmallVectorImpl<Instruction *> &DeadInsts);
+
+  TargetTransformInfo *TTI = nullptr;
+};
+
+} // namespace
+
+char ComplexArithmeticLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexArithmeticLegacyPass, DEBUG_TYPE,
+                      "Complex Arithmetic", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ComplexArithmeticLegacyPass, DEBUG_TYPE,
+                    "Complex Arithmetic", false, false)
+
+PreservedAnalyses ComplexArithmeticPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!ComplexArithmetic(&TTI).runOnFunction(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+FunctionPass *llvm::createComplexArithmeticPass() {
+  return new ComplexArithmeticLegacyPass();
+}
+
+bool ComplexArithmeticLegacyPass::runOnFunction(Function &F) {
+  auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return ComplexArithmetic(&TTI).runOnFunction(F);
+}
+
+// Used so the "pass is disabled" messages are only reported once per process.
+static bool HasBeenDisabled = false;
+
+bool ComplexArithmetic::runOnFunction(Function &F) {
+  LLVM_DEBUG(dbgs() << "ComplexArithmetic::runOnFunction.\n");
+
+  if (!ComplexArithmeticEnabled) {
+    LLVM_DEBUG(if (!HasBeenDisabled) dbgs()
+                   << "Complex has been explicitly disabled.\n");
+    HasBeenDisabled = true;
+    return false;
+  }
+
+  if (!TTI->supportsComplexArithmetic()) {
+    LLVM_DEBUG(if (!HasBeenDisabled) dbgs()
+                   << "Complex has been disabled, "
+                      "target does not support lowering of complex numbers.\n");
+    HasBeenDisabled = true;
+    return false;
+  }
+
+  bool Changed = false;
+  SmallVector<Instruction *, 16> DeadInsts;
+  for (auto &B : F)
+    Changed |= evaluateComplexArithmeticBasicBlock(&B, DeadInsts);
+
+  if (Changed)
+    cleanupDeadInsts(DeadInsts);
+
+  return Changed;
+}
+
+/**
+ * Checks the given mask, and determines whether said mask is interleaving.
+ *
+ * To be interleaving, a mask must alternate between `i` and `i + (Length / 2)`,
+ * and must contain all numbers within the range of `[0..Length)`
+ * (e.g. a 4x vector interleaving mask would be <0, 2, 1, 3>).
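+ *
+ * For illustration:
+ *   isInterleavingMask({0, 4, 1, 5, 2, 6, 3, 7}, 4); // true
+ *   isInterleavingMask({0, 1, 2, 3, 4, 5, 6, 7}, 4); // false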
+ */ +static bool isInterleavingMask(ArrayRef Mask, int NumElements) { + if (Mask.size() != NumElements * 2) { + return false; + } + + for (unsigned i = 0; i < NumElements; ++i) { + if (Mask[(i * 2) + 1] != (Mask[i * 2] + NumElements)) { + return false; + } + } + + return true; +} + +/** + * Checks the mask of the given ShuffleVectorInst, and determines whether said + * shuffle is interleaving. See isInterleavingMask. + */ +static bool isInterleaving(ShuffleVectorInst *SVI) { + auto *Ty = dyn_cast(SVI->getOperand(0)->getType()); + if (!Ty) + return false; + + unsigned NumElements = Ty->getNumElements(); + return isInterleavingMask(SVI->getShuffleMask(), NumElements); +} + +/** + * Checks the given mask, and determines whether said mask is deinterleaving. + * + * To be deinterleaving, a mask must match the pattern `i * 2`, with an optional + * offset of 1. (e.g. a 4x vector deinterleaving mask would look like <0, 2, 4, + * 6> or <1, 3, 5, 7>). + */ +static bool isDeinterleavingMask(ArrayRef Mask, int NumElements) { + if (Mask.size() != NumElements) + return false; + + for (unsigned i = 0; i < Mask.size() - 1; ++i) { + if (Mask[i + 1] != (Mask[i] + NumElements)) + return false; + } + + return true; +} + +static bool matchComplexMul(ShuffleVectorInst *SVI, ComplexArithmeticGraph &G) { + + unsigned LikelyRotation = 0; + + Value *LeftShuffleAR; + Value *LeftShuffleAI; + Value *LeftShuffleBR; + Value *LeftShuffleBI; + + Value *RightShuffleAR; + Value *RightShuffleAI; + Value *RightShuffleBR; + Value *RightShuffleBI; + + auto Mask = createInterleavingMask(SVI->getShuffleMask().size()); + + auto InterleaveShuffleRot0Pattern = m_Shuffle( + m_FSub(m_FMul(m_Value(LeftShuffleBR), m_Value(LeftShuffleAR)), + m_FMul(m_Value(LeftShuffleBI), m_Value(LeftShuffleAI))), + m_FAdd(m_FMul(m_Value(RightShuffleBI), m_Value(RightShuffleAR)), + m_FMul(m_Value(RightShuffleBR), m_Value(RightShuffleAI))), + m_SpecificMask(Mask)); + + auto InterleaveShuffleRot180Pattern = m_Shuffle( + m_FSub(m_FMul(m_Value(LeftShuffleBI), m_Value(LeftShuffleAI)), + m_FMul(m_Value(LeftShuffleBR), m_Value(LeftShuffleAR))), + m_FSub(m_FMul(m_Value(RightShuffleBR), m_FNeg(m_Value(RightShuffleAI))), + m_FMul(m_Value(RightShuffleBI), m_Value(RightShuffleAR))), + m_SpecificMask(Mask)); + + if (match(SVI, InterleaveShuffleRot0Pattern)) + LikelyRotation = 0; + else if (match(SVI, InterleaveShuffleRot180Pattern)) + LikelyRotation = 180; + else { + LLVM_DEBUG(dbgs() << "SVI does not match expected patterns.\n"); + return false; + } + + if (LeftShuffleAR != RightShuffleAR) + return false; + if (LeftShuffleAI != RightShuffleAI) + return false; + if (LeftShuffleBR != RightShuffleBR) + return false; + if (LeftShuffleBI != RightShuffleBI) + return false; + + G.addNode(cast(LeftShuffleAR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(LeftShuffleAI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + G.addNode(cast(LeftShuffleBR), + ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real); + G.addNode(cast(LeftShuffleBI), + ComplexArithmeticGraph::Shuffle | + ComplexArithmeticGraph::Imaginary); + + auto *Op0 = cast(SVI->getOperand(0)); + auto *FAdd = cast(SVI->getOperand(1)); + + G.addNode(Op0, ComplexArithmeticGraph::Real); + G.addNode(FAdd, ComplexArithmeticGraph::Imaginary); + + G.addNode(cast(Op0->getOperand(0)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(Op0->getOperand(1)), + ComplexArithmeticGraph::Discover); + G.addNode(cast(FAdd->getOperand(0)), + 
            ComplexArithmeticGraph::Discover);
+  G.addNode(cast<Instruction>(FAdd->getOperand(1)),
+            ComplexArithmeticGraph::Discover);
+
+  G.setType(ComplexArithmeticGraph::Complex_Mul);
+  G.setRotation(LikelyRotation);
+
+  return true;
+}
+
+static bool matchComplexAdd(ShuffleVectorInst *SVI, ComplexArithmeticGraph &G) {
+  Value *ShuffleAR = nullptr;
+  Value *ShuffleAI = nullptr;
+  Value *ShuffleBR = nullptr;
+  Value *ShuffleBI = nullptr;
+
+  auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+  auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+
+  if (!Op0 || !Op1)
+    return false;
+
+  unsigned Rotation;
+  if (Op0->getOpcode() == Instruction::FSub &&
+      Op1->getOpcode() == Instruction::FAdd) {
+    Rotation = 90;
+  } else if (Op0->getOpcode() == Instruction::FAdd &&
+             Op1->getOpcode() == Instruction::FSub) {
+    Rotation = 270;
+  } else {
+    return false;
+  }
+
+  auto ShuffleMask = createInterleavingMask(SVI->getShuffleMask().size());
+
+  if (Rotation == 90) {
+    if (!match(SVI, m_Shuffle(m_FSub(m_Value(ShuffleAR), m_Value(ShuffleBI)),
+                              m_FAdd(m_Value(ShuffleAI), m_Value(ShuffleBR)),
+                              m_SpecificMask(ShuffleMask)))) {
+      LLVM_DEBUG(
+          dbgs() << "SVI does not match expected pattern for complex add rot "
+                 << Rotation << ".\n");
+      return false;
+    }
+  } else if (Rotation == 270) {
+    if (!match(SVI, m_Shuffle(m_FAdd(m_Value(ShuffleBI), m_Value(ShuffleAR)),
+                              m_FSub(m_Value(ShuffleAI), m_Value(ShuffleBR)),
+                              m_SpecificMask(ShuffleMask)))) {
+      LLVM_DEBUG(
+          dbgs() << "SVI does not match expected pattern for complex add rot "
+                 << Rotation << ".\n");
+      return false;
+    }
+  }
+
+  if (!isa<ShuffleVectorInst>(ShuffleAR) ||
+      !isa<ShuffleVectorInst>(ShuffleAI) ||
+      !isa<ShuffleVectorInst>(ShuffleBR) ||
+      !isa<ShuffleVectorInst>(ShuffleBI)) {
+    LLVM_DEBUG(dbgs() << "SVI does not match expected pattern for complex add, "
+                         "inputs aren't all shuffles.\n");
+    return false;
+  }
+
+  G.addNode(cast<Instruction>(ShuffleAR),
+            ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  G.addNode(cast<Instruction>(ShuffleAI),
+            ComplexArithmeticGraph::Shuffle |
+                ComplexArithmeticGraph::Imaginary);
+  G.addNode(cast<Instruction>(ShuffleBR),
+            ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  G.addNode(cast<Instruction>(ShuffleBI),
+            ComplexArithmeticGraph::Shuffle |
+                ComplexArithmeticGraph::Imaginary);
+
+  G.addNode(Op0, ComplexArithmeticGraph::AddOperand);
+  G.addNode(Op1, ComplexArithmeticGraph::AddOperand);
+
+  G.setType(ComplexArithmeticGraph::Complex_Add);
+  G.setRotation(Rotation);
+
+  return true;
+}
+
+static bool traverseAndPopulateGraph(TargetTransformInfo *TTI, Instruction *I,
+                                     ComplexArithmeticGraph &G) {
+  G.setCurrentInstruction(I);
+
+  // Shuffle mask needs to interleave vectors,
+  // e.g.
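+  //   (lane k of the first source is paired with lane k of the second)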
+  //   <4 x i32> <0, 2, 1, 3>
+  //   <8 x i32> <0, 4, 1, 5, 2, 6, 3, 7>
+
+  if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+    if (!isInterleaving(SVI)) {
+      LLVM_DEBUG(dbgs() << "SVI doesn't appear to perform interleaving.\n");
+      return false;
+    }
+
+    if (matchComplexMul(SVI, G))
+      return true;
+
+    if (matchComplexAdd(SVI, G))
+      return true;
+
+    if (TTI->matchComplexArithmeticIR(SVI, G))
+      return true;
+  }
+
+  return false;
+}
+
+static bool substituteGraph(TargetTransformInfo *TTI, Instruction *I,
+                            ComplexArithmeticGraph &G,
+                            SmallVectorImpl<Instruction *> &DeadInsts) {
+  G.setCurrentInstruction(I);
+
+  SmallVector<Instruction *, 8> RealShuffles = G.getNodesOfType(
+      ComplexArithmeticGraph::Shuffle | ComplexArithmeticGraph::Real);
+  if (RealShuffles.size() < 2)
+    return false;
+
+  auto *LoadA = RealShuffles[0]->getOperand(0);
+  auto *LoadB = RealShuffles[1]->getOperand(0);
+
+  auto *TyA = cast<FixedVectorType>(LoadA->getType());
+
+  const unsigned MaxVectorWidth = 128;
+  unsigned NumBits = TyA->getScalarSizeInBits() * TyA->getNumElements();
+  unsigned NumElementsPerVector = MaxVectorWidth / TyA->getScalarSizeInBits();
+  int GeneratedIntrinsics = 0;
+  if (NumBits > MaxVectorWidth) {
+    LLVM_DEBUG(dbgs() << "Split required, " << NumBits
+                      << " is greater than the max vector width ("
+                      << MaxVectorWidth << ").\n");
+    if (NumBits % MaxVectorWidth != 0) {
+      LLVM_DEBUG(dbgs() << "Vector can't be split evenly.\n");
+      return false;
+    }
+
+    IRBuilder<> B(I);
+
+    unsigned SplitCount = NumBits / MaxVectorWidth;
+
+    if (SplitCount > 2) {
+      LLVM_DEBUG(dbgs() << "Cannot split operation beyond 2.\n");
+      return false;
+    }
+
+    SmallVector<Value *, 8> CreatedInsts;
+    SmallVector<Value *, 2> ComplexIR;
+    for (unsigned i = 0; i < SplitCount; ++i) {
+      auto Mask =
+          createContiguousMask(NumElementsPerVector, NumElementsPerVector * i);
+      auto *Undef = UndefValue::get(LoadA->getType());
+      auto *ShuffleA = B.CreateShuffleVector(LoadA, Undef, Mask);
+      auto *ShuffleB = B.CreateShuffleVector(LoadB, Undef, Mask);
+
+      CreatedInsts.push_back(ShuffleA);
+      CreatedInsts.push_back(ShuffleB);
+
+      auto *IR = TTI->createComplexArithmeticIR(G, ShuffleA, ShuffleB,
+                                                GeneratedIntrinsics);
+      if (IR == nullptr) {
+        for (auto &Item : CreatedInsts)
+          DeadInsts.push_back(cast<Instruction>(Item));
+        return false;
+      }
+      NumComplexIntrinsics += GeneratedIntrinsics;
+      ComplexIR.push_back(IR);
+      CreatedInsts.push_back(IR);
+    }
+
+    auto ConcatMask = createContiguousMask(NumElementsPerVector * 2);
+    auto *Shuffle =
+        B.CreateShuffleVector(ComplexIR[0], ComplexIR[1], ConcatMask);
+    I->replaceAllUsesWith(Shuffle);
+  } else {
+    auto *Mla =
+        TTI->createComplexArithmeticIR(G, LoadA, LoadB, GeneratedIntrinsics);
+    if (Mla == nullptr)
+      return false;
+    NumComplexIntrinsics += GeneratedIntrinsics;
+    I->replaceAllUsesWith(Mla);
+  }
+
+  for (Instruction *Node : G.getAllNodes())
+    DeadInsts.push_back(Node);
+
+  return true;
+}
+
+bool ComplexArithmetic::evaluateComplexArithmeticBasicBlock(
+    BasicBlock *B, SmallVectorImpl<Instruction *> &DeadInsts) {
+  bool Substituted = false;
+
+  for (auto &I : *B) {
+    auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+    if (!SVI || !isInterleaving(SVI))
+      continue;
+
+    // Build a fresh graph per candidate so nodes from a previous match cannot
+    // leak into the next one.
+    ComplexArithmeticGraph Graph;
+    Graph.addNode(SVI, ComplexArithmeticGraph::Shuffle);
+    if (traverseAndPopulateGraph(TTI, SVI, Graph))
+      Substituted |= substituteGraph(TTI, &I, Graph, DeadInsts);
+  }
+
+  return Substituted;
+}
+
+void ComplexArithmetic::cleanupDeadInsts(
+    SmallVectorImpl<Instruction *> &DeadInsts) {
+
+  // TODO clean up the dead instructions better. (Ask in review?)
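+  // A possible tidier approach (untested sketch): hand each candidate to
+  // llvm::RecursivelyDeleteTriviallyDeadInstructions() from
+  // llvm/Transforms/Utils/Local.h once its uses are gone, rather than
+  // scanning the list to a fixed point by hand.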
+ unsigned iter = 0; + unsigned count = DeadInsts.size(); + unsigned remaining = DeadInsts.size(); + while (!DeadInsts.empty() && remaining > 0 && iter < count) { + ++iter; + remaining = 0; + for (auto *It = DeadInsts.begin(); It != DeadInsts.end(); It++) { + auto *I = *It; + + if (I->getParent()) + remaining++; + + if (I->getNumUses() == 0 && I->getParent()) { + remaining--; + I->eraseFromParent(); + } + } + } + + DeadInsts.clear(); +} diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -45,6 +45,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Arithmetic Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-add.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-add.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 3 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_add_v2f16(<2 x half> %wide.vec, <2 x half> %wide.vec23, <2 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vsub.f16 s4, s0, s2 +; CHECK-NEXT: vadd.f16 s2, s2, s0 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <2 x half>* + %wide.vec2 = load <2 x half>, <2 x half>* null, align 4 + %strided.vec = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %strided.vec22 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %wide.vec233 = load <2 x half>, <2 x half>* null, align 4 + %strided.vec24 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %strided.vec25 = shufflevector <2 x half> %wide.vec, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <1 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + store <2 x half> %interleaved.vec, <2 x half>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v4f16(<4 x half> %wide.vec, <4 x half> %wide.vec23, <4 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vmovx.f16 s4, s0 +; 
CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vins.f16 s4, s2 +; CHECK-NEXT: vins.f16 s8, s1 +; CHECK-NEXT: vsub.f16 q3, q2, q1 +; CHECK-NEXT: vadd.f16 q1, q1, q2 +; CHECK-NEXT: vmovx.f16 s13, s12 +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s13, s2 +; CHECK-NEXT: vmov r1, r2, d6 +; CHECK-NEXT: strd r1, r2, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <4 x half>* + %wide.vec2 = load <4 x half>, <4 x half>* null, align 4 + %strided.vec = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %strided.vec22 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %wide.vec233 = load <4 x half>, <4 x half>* null, align 4 + %strided.vec24 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %strided.vec25 = shufflevector <4 x half> %wide.vec, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <2 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + store <4 x half> %interleaved.vec, <4 x half>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v8f16(<8 x half> %wide.vec, <8 x half> %wide.vec23, <8 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: vcadd.f16 q0, q0, q0, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <8 x half>* + %wide.vec2 = load <8 x half>, <8 x half>* null, align 4 + %strided.vec = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %strided.vec22 = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %wide.vec233 = load <8 x half>, <8 x half>* null, align 4 + %strided.vec24 = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %strided.vec25 = shufflevector <8 x half> %wide.vec, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <4 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + store <8 x half> %interleaved.vec, <8 x half>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v16f16(<16 x half> %wide.vec, <16 x half> %wide.vec23, <16 x half>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: ldr r0, [sp, #48] +; CHECK-NEXT: vcadd.f16 q0, q0, q0, #90 +; CHECK-NEXT: vcadd.f16 q1, q1, q1, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { half, half }* null to <16 x half>* + %wide.vec2 = load <16 x half>, <16 x half>* null, align 4 
+ %strided.vec = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %strided.vec22 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %wide.vec233 = load <16 x half>, <16 x half>* null, align 4 + %strided.vec24 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %strided.vec25 = shufflevector <16 x half> %wide.vec, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %strided.vec24, %strided.vec22 + %1 = fadd fast <8 x half> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + store <16 x half> %interleaved.vec, <16 x half>* %lsr.iv5153, align 4 + br label %vector.body +} diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f16-mul.ll @@ -0,0 +1,194 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 6 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr r2, [r0] +; CHECK-NEXT: ldr r3, [r1] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: vmov.32 q1[0], r3 +; CHECK-NEXT: vmovx.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s6, s4 +; CHECK-NEXT: vmul.f16 s8, s6, s2 +; CHECK-NEXT: vmul.f16 s2, s4, s2 +; CHECK-NEXT: vfnms.f16 s8, s4, s0 +; CHECK-NEXT: vfma.f16 s2, s6, s0 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: str r2, [r0] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <2 x half>* + %b.ptr = bitcast half* %b to <2 x half>* + %c.ptr = bitcast half* %c to <2 x half>* + %a.val = load <2 x half>, <2 x half>* %a.ptr + %b.val = load <2 x half>, <2 x half>* %b.ptr + %strided.vec = shufflevector <2 x half> %a.val, <2 x half> poison, <1 x i32> + %strided.vec46 = shufflevector <2 x half> %a.val, <2 x half> poison, <1 x i32> + %strided.vec48 = shufflevector <2 x half> %b.val, <2 x half> poison, <1 x i32> + %strided.vec49 = shufflevector <2 x half> %b.val, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %strided.vec48, %strided.vec + %1 = fmul fast <1 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <1 x half> %0, %1 + %3 = fmul fast <1 x half> %strided.vec49, %strided.vec + %4 = fmul fast <1 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <1 x half> %3, %4 + %6 = bitcast half* undef to <2 x half>* + %interleaved.vec = shufflevector <1 x half> %2, <1 x half> %5, <2 x i32> + store <2 x half> %interleaved.vec, <2 x half>* %6, align 4 + br label %vector.body +} + +define void @complex_mul_v4f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldrd r2, r12, [r0] +; CHECK-NEXT: vmov.32 q0[0], r2 +; CHECK-NEXT: 
ldrd r2, r3, [r1] +; CHECK-NEXT: vmov.32 q0[1], r12 +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[1], r3 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s16, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s16, s2 +; CHECK-NEXT: vmul.f16 q3, q4, q2 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vneg.f16 q3, q3 +; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vmul.f16 q1, q1, q2 +; CHECK-NEXT: vfma.f16 q1, q4, q0 +; CHECK-NEXT: vmovx.f16 s13, s12 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vins.f16 s12, s4 +; CHECK-NEXT: vins.f16 s13, s0 +; CHECK-NEXT: vmov r2, r3, d6 +; CHECK-NEXT: str r3, [r0] +; CHECK-NEXT: str r2, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <4 x half>* + %b.ptr = bitcast half* %b to <4 x half>* + %c.ptr = bitcast half* %c to <4 x half>* + %a.val = load <4 x half>, <4 x half>* %a.ptr + %b.val = load <4 x half>, <4 x half>* %b.ptr + %strided.vec = shufflevector <4 x half> %a.val, <4 x half> poison, <2 x i32> + %strided.vec46 = shufflevector <4 x half> %a.val, <4 x half> poison, <2 x i32> + %strided.vec48 = shufflevector <4 x half> %b.val, <4 x half> poison, <2 x i32> + %strided.vec49 = shufflevector <4 x half> %b.val, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %strided.vec48, %strided.vec + %1 = fmul fast <2 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <2 x half> %0, %1 + %3 = fmul fast <2 x half> %strided.vec49, %strided.vec + %4 = fmul fast <2 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <2 x half> %3, %4 + %6 = bitcast half* undef to <4 x half>* + %interleaved.vec = shufflevector <2 x half> %2, <2 x half> %5, <4 x i32> + store <4 x half> %interleaved.vec, <4 x half>* %6, align 4 + br label %vector.body +} + +define void @complex_mul_v8f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #90 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <8 x half>* + %b.ptr = bitcast half* %b to <8 x half>* + %c.ptr = bitcast half* %c to <8 x half>* + %a.val = load <8 x half>, <8 x half>* %a.ptr + %b.val = load <8 x half>, <8 x half>* %b.ptr + %strided.vec = shufflevector <8 x half> %a.val, <8 x half> poison, <4 x i32> + %strided.vec46 = shufflevector <8 x half> %a.val, <8 x half> poison, <4 x i32> + %strided.vec48 = shufflevector <8 x half> %b.val, <8 x half> poison, <4 x i32> + %strided.vec49 = shufflevector <8 x half> %b.val, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %strided.vec48, %strided.vec + %1 = fmul fast <4 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <4 x half> %0, %1 + %3 = fmul fast <4 x half> %strided.vec49, %strided.vec + %4 = fmul fast <4 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <4 x half> %3, %4 + %6 = bitcast half* undef to <8 x half>* + %interleaved.vec = shufflevector <4 x half> %2, <4 x half> %5, <8 x i32> + store <8 x half> %interleaved.vec, <8 x half>* %6, align 4 + br label %vector.body +} + +define void @complex_mul_v16f16(half* %a, half* %b, half* %c) #0 { +; CHECK-LABEL: 
complex_mul_v16f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, #16] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vcmul.f16 q4, q1, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q1, q2, #90 +; CHECK-NEXT: vcmul.f16 q1, q0, q3, #0 +; CHECK-NEXT: vcmla.f16 q1, q0, q3, #90 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: vstrw.32 q4, [r0] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: + %a.ptr = bitcast half* %a to <16 x half>* + %b.ptr = bitcast half* %b to <16 x half>* + %c.ptr = bitcast half* %c to <16 x half>* + %a.val = load <16 x half>, <16 x half>* %a.ptr + %b.val = load <16 x half>, <16 x half>* %b.ptr + %strided.vec = shufflevector <16 x half> %a.val, <16 x half> poison, <8 x i32> + %strided.vec46 = shufflevector <16 x half> %a.val, <16 x half> poison, <8 x i32> + %strided.vec48 = shufflevector <16 x half> %b.val, <16 x half> poison, <8 x i32> + %strided.vec49 = shufflevector <16 x half> %b.val, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %strided.vec48, %strided.vec + %1 = fmul fast <8 x half> %strided.vec49, %strided.vec46 + %2 = fsub fast <8 x half> %0, %1 + %3 = fmul fast <8 x half> %strided.vec49, %strided.vec + %4 = fmul fast <8 x half> %strided.vec48, %strided.vec46 + %5 = fadd fast <8 x half> %3, %4 + %6 = bitcast half* undef to <16 x half>* + %interleaved.vec = shufflevector <8 x half> %2, <8 x half> %5, <16 x i32> + store <16 x half> %interleaved.vec, <16 x half>* %6, align 4 + br label %vector.body +} diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-add.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-add.ll @@ -0,0 +1,173 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 3 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_add_v2f32(<2 x float> %wide.vec, <2 x float> %wide.vec23, <2 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsub.f32 s2, s0, s1 +; CHECK-NEXT: vadd.f32 s4, s1, s0 +; CHECK-NEXT: vstr s2, [r0] +; CHECK-NEXT: vstr s4, [r0, #4] +; CHECK-NEXT: b .LBB0_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <2 x float>* + %wide.vec2 = load <2 x float>, <2 x float>* null, align 4 + %strided.vec = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %strided.vec22 = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %wide.vec233 = load <2 x float>, <2 x float>* null, align 4 + %strided.vec24 = shufflevector <2 x float> %wide.vec, <2 x float> zeroinitializer, <1 x i32> + %strided.vec25 = shufflevector <2 x float> %wide.vec, 
<2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <1 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + store <2 x float> %interleaved.vec, <2 x float>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v4f32(<4 x float> %wide.vec, <4 x float> %wide.vec23, <4 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d3, r2, r3 +; CHECK-NEXT: vmov d2, r0, r1 +; CHECK-NEXT: ldr r0, [sp, #16] +; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: b .LBB1_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <4 x float>* + %wide.vec2 = load <4 x float>, <4 x float>* null, align 4 + %strided.vec = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %strided.vec22 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %wide.vec233 = load <4 x float>, <4 x float>* null, align 4 + %strided.vec24 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %strided.vec25 = shufflevector <4 x float> %wide.vec, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <2 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + store <4 x float> %interleaved.vec, <4 x float>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v8f32(<8 x float> %wide.vec, <8 x float> %wide.vec23, <8 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: vmov d2, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vmov d3, r2, r3 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: ldr r0, [sp, #48] +; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90 +; CHECK-NEXT: vcadd.f32 q1, q2, q2, #90 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q1, [r0, #16] +; CHECK-NEXT: b .LBB2_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <8 x float>* + %wide.vec2 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %strided.vec22 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %wide.vec233 = load <8 x float>, <8 x float>* null, align 4 + %strided.vec24 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %strided.vec25 = shufflevector <8 x float> %wide.vec, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <4 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + store <8 x float> %interleaved.vec, <8 x float>* %lsr.iv5153, align 4 + br label %vector.body +} + +define void @complex_add_v16f32(<16 x float> %wide.vec, <16 x float> %wide.vec23, <16 x float>* %lsr.iv5153) #0 { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .vsave {d8, 
d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: vmov d1, r2, r3 +; CHECK-NEXT: add r2, sp, #112 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: add r1, sp, #128 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: ldr r0, [sp, #224] +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q3, [r2] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s0 +; CHECK-NEXT: vmov.f32 s19, s2 +; CHECK-NEXT: vmov.f32 s22, s1 +; CHECK-NEXT: vmov.f32 s23, s3 +; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s17, s10 +; CHECK-NEXT: vmov.f32 s20, s9 +; CHECK-NEXT: vmov.f32 s21, s11 +; CHECK-NEXT: vsub.f32 q6, q4, q5 +; CHECK-NEXT: vadd.f32 q7, q5, q4 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s19, s14 +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s21, s3 +; CHECK-NEXT: vmov.f32 s22, s13 +; CHECK-NEXT: vmov.f32 s23, s15 +; CHECK-NEXT: vsub.f32 q0, q4, q5 +; CHECK-NEXT: vadd.f32 q1, q5, q4 +; CHECK-NEXT: mov r1, r0 +; CHECK-NEXT: vst20.32 {q0, q1}, [r0] +; CHECK-NEXT: vst21.32 {q0, q1}, [r1]! +; CHECK-NEXT: vst20.32 {q6, q7}, [r1] +; CHECK-NEXT: vst21.32 {q6, q7}, [r1] +; CHECK-NEXT: b .LBB3_1 +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv51531 = bitcast { float, float }* null to <16 x float>* + %wide.vec2 = load <16 x float>, <16 x float>* null, align 4 + %strided.vec = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %strided.vec22 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %wide.vec233 = load <16 x float>, <16 x float>* null, align 4 + %strided.vec24 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %strided.vec25 = shufflevector <16 x float> %wide.vec, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %strided.vec24, %strided.vec22 + %1 = fadd fast <8 x float> %strided.vec25, %strided.vec + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + store <16 x float> %interleaved.vec, <16 x float>* %lsr.iv5153, align 4 + br label %vector.body +} diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f32-mul.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -o - | FileCheck %s +; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS + +; STATS: "complex-arithmetic.NumComplexIntrinsics": 6 + +target triple = "arm-arm-none-eabi" +attributes #0 = { "target-cpu"="cortex-m55" } + +define void @complex_mul_v2f32(float* %a, float* %b, float* %c) #0 { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; 
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
+
+define void @complex_mul_v2f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v2f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldr s2, [r0, #4]
+; CHECK-NEXT: vldr s4, [r1, #4]
+; CHECK-NEXT: vldr s6, [r1]
+; CHECK-NEXT: vldr s0, [r0]
+; CHECK-NEXT: vmul.f32 s8, s4, s2
+; CHECK-NEXT: vmul.f32 s2, s6, s2
+; CHECK-NEXT: vfma.f32 s2, s4, s0
+; CHECK-NEXT: vfnms.f32 s8, s6, s0
+; CHECK-NEXT: vstr s2, [r0]
+; CHECK-NEXT: vstr s8, [r0]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <2 x float>*
+  %b.ptr = bitcast float* %b to <2 x float>*
+  %c.ptr = bitcast float* %c to <2 x float>*
+  %a.val = load <2 x float>, <2 x float>* %a.ptr
+  %b.val = load <2 x float>, <2 x float>* %b.ptr
+  %strided.vec = shufflevector <2 x float> %a.val, <2 x float> poison, <1 x i32> <i32 0>
+  %strided.vec46 = shufflevector <2 x float> %a.val, <2 x float> poison, <1 x i32> <i32 1>
+  %strided.vec48 = shufflevector <2 x float> %b.val, <2 x float> poison, <1 x i32> <i32 0>
+  %strided.vec49 = shufflevector <2 x float> %b.val, <2 x float> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <1 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <1 x float> %0, %1
+  %3 = fmul fast <1 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <1 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <1 x float> %3, %4
+  %6 = bitcast float* undef to <2 x float>*
+  %interleaved.vec = shufflevector <1 x float> %2, <1 x float> %5, <2 x i32> <i32 0, i32 1>
+  store <2 x float> %interleaved.vec, <2 x float>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v4f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v4f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0
+; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <4 x float>*
+  %b.ptr = bitcast float* %b to <4 x float>*
+  %c.ptr = bitcast float* %c to <4 x float>*
+  %a.val = load <4 x float>, <4 x float>* %a.ptr
+  %b.val = load <4 x float>, <4 x float>* %b.ptr
+  %strided.vec = shufflevector <4 x float> %a.val, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec46 = shufflevector <4 x float> %a.val, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec48 = shufflevector <4 x float> %b.val, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec49 = shufflevector <4 x float> %b.val, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <2 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <2 x float> %3, %4
+  %6 = bitcast float* undef to <4 x float>*
+  %interleaved.vec = shufflevector <2 x float> %2, <2 x float> %5, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x float> %interleaved.vec, <4 x float>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v8f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v8f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB2_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q3, [r1, #16]
+; CHECK-NEXT: vcmul.f32 q4, q1, q2, #0
+; CHECK-NEXT: vcmla.f32 q4, q1, q2, #90
+; CHECK-NEXT: vcmul.f32 q1, q0, q3, #0
+; CHECK-NEXT: vcmla.f32 q1, q0, q3, #90
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: b .LBB2_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <8 x float>*
+  %b.ptr = bitcast float* %b to <8 x float>*
+  %c.ptr = bitcast float* %c to <8 x float>*
+  %a.val = load <8 x float>, <8 x float>* %a.ptr
+  %b.val = load <8 x float>, <8 x float>* %b.ptr
+  %strided.vec = shufflevector <8 x float> %a.val, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec46 = shufflevector <8 x float> %a.val, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec48 = shufflevector <8 x float> %b.val, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec49 = shufflevector <8 x float> %b.val, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <4 x float> %3, %4
+  %6 = bitcast float* undef to <8 x float>*
+  %interleaved.vec = shufflevector <4 x float> %2, <4 x float> %5, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v16f32(float* %a, float* %b, float* %c) #0 {
+; CHECK-LABEL: complex_mul_v16f32:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .save {r4, r5}
+; CHECK-NEXT: push {r4, r5}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB3_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vld20.32 {q2, q3}, [r1]
+; CHECK-NEXT: mov r3, r1
+; CHECK-NEXT: vld20.32 {q0, q1}, [r0]
+; CHECK-NEXT: mov r2, r0
+; CHECK-NEXT: vld21.32 {q2, q3}, [r3]!
+; CHECK-NEXT: vld21.32 {q0, q1}, [r2]!
+; CHECK-NEXT: vld20.32 {q4, q5}, [r3]
+; CHECK-NEXT: vld20.32 {q6, q7}, [r2]
+; CHECK-NEXT: vld21.32 {q4, q5}, [r3]
+; CHECK-NEXT: vld21.32 {q6, q7}, [r2]
+; CHECK-NEXT: vstmia sp, {d4, d5, d6, d7} @ 32-byte Spill
+; CHECK-NEXT: vmov q2, q5
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmul.f32 q4, q2, q7
+; CHECK-NEXT: vneg.f32 q4, q4
+; CHECK-NEXT: vfma.f32 q4, q5, q6
+; CHECK-NEXT: vmul.f32 q5, q5, q7
+; CHECK-NEXT: vfma.f32 q5, q2, q6
+; CHECK-NEXT: vldmia sp, {d4, d5, d6, d7} @ 32-byte Reload
+; CHECK-NEXT: vmul.f32 q6, q3, q1
+; CHECK-NEXT: vneg.f32 q6, q6
+; CHECK-NEXT: vmul.f32 q7, q2, q1
+; CHECK-NEXT: vfma.f32 q6, q2, q0
+; CHECK-NEXT: vfma.f32 q7, q3, q0
+; CHECK-NEXT: vst20.32 {q6, q7}, [r0]
+; CHECK-NEXT: vst21.32 {q6, q7}, [r0]
+; CHECK-NEXT: vst20.32 {q4, q5}, [r0]
+; CHECK-NEXT: vst21.32 {q4, q5}, [r0]
+; CHECK-NEXT: b .LBB3_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast float* %a to <16 x float>*
+  %b.ptr = bitcast float* %b to <16 x float>*
+  %c.ptr = bitcast float* %c to <16 x float>*
+  %a.val = load <16 x float>, <16 x float>* %a.ptr
+  %b.val = load <16 x float>, <16 x float>* %b.ptr
+  %strided.vec = shufflevector <16 x float> %a.val, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec46 = shufflevector <16 x float> %a.val, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %strided.vec48 = shufflevector <16 x float> %b.val, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec49 = shufflevector <16 x float> %b.val, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x float> %strided.vec48, %strided.vec
+  %1 = fmul fast <8 x float> %strided.vec49, %strided.vec46
+  %2 = fsub fast <8 x float> %0, %1
+  %3 = fmul fast <8 x float> %strided.vec49, %strided.vec
+  %4 = fmul fast <8 x float> %strided.vec48, %strided.vec46
+  %5 = fadd fast <8 x float> %3, %4
+  %6 = bitcast float* undef to <16 x float>*
+  %interleaved.vec = shufflevector <8 x float> %2, <8 x float> %5, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x float> %interleaved.vec, <16 x float>* %6, align 4
+  br label %vector.body
+}
diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f64-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f64-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-f64-mul.ll
@@ -0,0 +1,256 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -o - | FileCheck %s
+; RUN: llc < %s -o /dev/null -stats -stats-json 2>&1 | FileCheck %s --check-prefix=STATS
+
+; NOTE: This statistic shouldn't appear; MVE doesn't have f64 complex instructions.
+; STATS-NOT: "complex-arithmetic.NumComplexIntrinsics"
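+
+; NOTE: Presumably the multiply pattern is still recognised here, but the
+; target reports no profitable f64 lowering, so the checks below pin the
+; generic scalar VFP expansion (vmul.f64/vfma.f64/vfnms.f64) rather than
+; vcmul/vcmla.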
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
+
+define void @complex_mul_v2f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v2f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vmul.f64 d4, d3, d1
+; CHECK-NEXT: vmul.f64 d5, d2, d1
+; CHECK-NEXT: vfnms.f64 d4, d2, d0
+; CHECK-NEXT: vfma.f64 d5, d3, d0
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <2 x double>*
+  %b.ptr = bitcast double* %b to <2 x double>*
+  %c.ptr = bitcast double* %c to <2 x double>*
+  %a.val = load <2 x double>, <2 x double>* %a.ptr
+  %b.val = load <2 x double>, <2 x double>* %b.ptr
+  %strided.vec = shufflevector <2 x double> %a.val, <2 x double> poison, <1 x i32> <i32 0>
+  %strided.vec46 = shufflevector <2 x double> %a.val, <2 x double> poison, <1 x i32> <i32 1>
+  %strided.vec48 = shufflevector <2 x double> %b.val, <2 x double> poison, <1 x i32> <i32 0>
+  %strided.vec49 = shufflevector <2 x double> %b.val, <2 x double> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <1 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <1 x double> %0, %1
+  %3 = fmul fast <1 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <1 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <1 x double> %3, %4
+  %6 = bitcast double* undef to <2 x double>*
+  %interleaved.vec = shufflevector <1 x double> %2, <1 x double> %5, <2 x i32> <i32 0, i32 1>
+  store <2 x double> %interleaved.vec, <2 x double>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v4f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v4f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q4, [r1]
+; CHECK-NEXT: vmul.f64 d4, d3, d1
+; CHECK-NEXT: vmul.f64 d5, d2, d1
+; CHECK-NEXT: vfnms.f64 d4, d2, d0
+; CHECK-NEXT: vfma.f64 d5, d3, d0
+; CHECK-NEXT: vmul.f64 d0, d9, d7
+; CHECK-NEXT: vmul.f64 d1, d8, d7
+; CHECK-NEXT: vfnms.f64 d0, d8, d6
+; CHECK-NEXT: vfma.f64 d1, d9, d6
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <4 x double>*
+  %b.ptr = bitcast double* %b to <4 x double>*
+  %c.ptr = bitcast double* %c to <4 x double>*
+  %a.val = load <4 x double>, <4 x double>* %a.ptr
+  %b.val = load <4 x double>, <4 x double>* %b.ptr
+  %strided.vec = shufflevector <4 x double> %a.val, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec46 = shufflevector <4 x double> %a.val, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec48 = shufflevector <4 x double> %b.val, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec49 = shufflevector <4 x double> %b.val, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <2 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <2 x double> %0, %1
+  %3 = fmul fast <2 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <2 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <2 x double> %3, %4
+  %6 = bitcast double* undef to <4 x double>*
+  %interleaved.vec = shufflevector <2 x double> %2, <2 x double> %5, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x double> %interleaved.vec, <4 x double>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v8f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v8f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB2_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q4, [r1, #32]
+; CHECK-NEXT: vmul.f64 d2, d5, d1
+; CHECK-NEXT: vmul.f64 d3, d4, d1
+; CHECK-NEXT: vfnms.f64 d2, d4, d0
+; CHECK-NEXT: vfma.f64 d3, d5, d0
+; CHECK-NEXT: vmul.f64 d0, d9, d7
+; CHECK-NEXT: vmul.f64 d1, d8, d7
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q5, [r1, #48]
+; CHECK-NEXT: vfnms.f64 d0, d8, d6
+; CHECK-NEXT: vfma.f64 d1, d9, d6
+; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q6, [r1, #16]
+; CHECK-NEXT: vmul.f64 d6, d11, d5
+; CHECK-NEXT: vmul.f64 d7, d10, d5
+; CHECK-NEXT: vfnms.f64 d6, d10, d4
+; CHECK-NEXT: vfma.f64 d7, d11, d4
+; CHECK-NEXT: vmul.f64 d4, d13, d9
+; CHECK-NEXT: vmul.f64 d5, d12, d9
+; CHECK-NEXT: vfnms.f64 d4, d12, d8
+; CHECK-NEXT: vfma.f64 d5, d13, d8
+; CHECK-NEXT: vstrw.32 q3, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: b .LBB2_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <8 x double>*
+  %b.ptr = bitcast double* %b to <8 x double>*
+  %c.ptr = bitcast double* %c to <8 x double>*
+  %a.val = load <8 x double>, <8 x double>* %a.ptr
+  %b.val = load <8 x double>, <8 x double>* %b.ptr
+  %strided.vec = shufflevector <8 x double> %a.val, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec46 = shufflevector <8 x double> %a.val, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec48 = shufflevector <8 x double> %b.val, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec49 = shufflevector <8 x double> %b.val, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <4 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <4 x double> %0, %1
+  %3 = fmul fast <4 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <4 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <4 x double> %3, %4
+  %6 = bitcast double* undef to <8 x double>*
+  %interleaved.vec = shufflevector <4 x double> %2, <4 x double> %5, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x double> %interleaved.vec, <8 x double>* %6, align 4
+  br label %vector.body
+}
+
+define void @complex_mul_v16f64(double* %a, double* %b, double* %c) #0 {
+; CHECK-LABEL: complex_mul_v16f64:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB3_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmul.f64 d0, d5, d3
+; CHECK-NEXT: vmul.f64 d1, d4, d3
+; CHECK-NEXT: vfnms.f64 d0, d4, d2
+; CHECK-NEXT: vfma.f64 d1, d5, d2
+; CHECK-NEXT: vldrw.u32 q4, [r1, #16]
+; CHECK-NEXT: vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmul.f64 d0, d9, d7
+; CHECK-NEXT: vmul.f64 d1, d8, d7
+; CHECK-NEXT: vfnms.f64 d0, d8, d6
+; CHECK-NEXT: vfma.f64 d1, d9, d6
+; CHECK-NEXT: vldrw.u32 q6, [r1, #32]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q7, [r1, #48]
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmul.f64 d0, d13, d11
+; CHECK-NEXT: vmul.f64 d1, d12, d11
+; CHECK-NEXT: vfnms.f64 d0, d12, d10
+; CHECK-NEXT: vfma.f64 d1, d13, d10
+; CHECK-NEXT: vmul.f64 d6, d15, d9
+; CHECK-NEXT: vmul.f64 d7, d14, d9
+; CHECK-NEXT: vldrw.u32 q5, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q6, [r1, #64]
+; CHECK-NEXT: vfnms.f64 d6, d14, d8
+; CHECK-NEXT: vfma.f64 d7, d15, d8
+; CHECK-NEXT: vldrw.u32 q7, [r0, #96]
+; CHECK-NEXT: vldrw.u32 q1, [r1, #96]
+; CHECK-NEXT: vmul.f64 d8, d13, d11
+; CHECK-NEXT: vmul.f64 d9, d12, d11
+; CHECK-NEXT: vfnms.f64 d8, d12, d10
+; CHECK-NEXT: vfma.f64 d9, d13, d10
+; CHECK-NEXT: vmul.f64 d10, d3, d15
+; CHECK-NEXT: vmul.f64 d11, d2, d15
+; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
+; CHECK-NEXT: vldrw.u32 q2, [r1, #112]
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vfnms.f64 d10, d2, d14
+; CHECK-NEXT: vfma.f64 d11, d3, d14
+; CHECK-NEXT: vldrw.u32 q7, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q0, [r1, #80]
+; CHECK-NEXT: vmul.f64 d2, d5, d13
+; CHECK-NEXT: vmul.f64 d3, d4, d13
+; CHECK-NEXT: vfnms.f64 d2, d4, d12
+; CHECK-NEXT: vfma.f64 d3, d5, d12
+; CHECK-NEXT: vmul.f64 d4, d1, d15
+; CHECK-NEXT: vmul.f64 d5, d0, d15
+; CHECK-NEXT: vfnms.f64 d4, d0, d14
+; CHECK-NEXT: vfma.f64 d5, d1, d14
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q5, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [r0]
+; CHECK-NEXT: vstrw.32 q4, [r0]
+; CHECK-NEXT: vstrw.32 q3, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: b .LBB3_1
+vector.ph:
+  br label %vector.body
+
+vector.body:
+  %a.ptr = bitcast double* %a to <16 x double>*
+  %b.ptr = bitcast double* %b to <16 x double>*
+  %c.ptr = bitcast double* %c to <16 x double>*
+  %a.val = load <16 x double>, <16 x double>* %a.ptr
+  %b.val = load <16 x double>, <16 x double>* %b.ptr
+  %strided.vec = shufflevector <16 x double> %a.val, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec46 = shufflevector <16 x double> %a.val, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %strided.vec48 = shufflevector <16 x double> %b.val, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %strided.vec49 = shufflevector <16 x double> %b.val, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x double> %strided.vec48, %strided.vec
+  %1 = fmul fast <8 x double> %strided.vec49, %strided.vec46
+  %2 = fsub fast <8 x double> %0, %1
+  %3 = fmul fast <8 x double> %strided.vec49, %strided.vec
+  %4 = fmul fast <8 x double> %strided.vec48, %strided.vec46
+  %5 = fadd fast <8 x double> %3, %4
+  %6 = bitcast double* undef to <16 x double>*
+  %interleaved.vec = shufflevector <8 x double> %2, <8 x double> %5, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x double> %interleaved.vec, <16 x double>* %6, align 4
+  br label %vector.body
+}
diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-add.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-add.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-add.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -o - | FileCheck %s
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
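+
+; NOTE: For reference (assuming the usual vcadd rotation semantics):
+; rotation I computes x + i*y, i.e. re = x.re - y.im and im = x.im + y.re,
+; which maps to vcadd #90; rotation III computes x - i*y, i.e.
+; re = x.re + y.im and im = x.im - y.re, which maps to vcadd #270.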
+
+define void @complex_rotation_I() #0 {
+; CHECK-LABEL: complex_rotation_I:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcadd.f32 q2, q0, q0, #90
+; CHECK-NEXT: vcadd.f32 q0, q1, q1, #90
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x float> %strided.vec41, %strided.vec39
+  %1 = fadd fast <4 x float> %strided.vec42, %strided.vec
+  %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
+
+define void @complex_rotation_III() #0 {
+; CHECK-LABEL: complex_rotation_III:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcadd.f32 q2, q0, q0, #270
+; CHECK-NEXT: vcadd.f32 q0, q1, q1, #270
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fadd fast <4 x float> %strided.vec42, %strided.vec
+  %1 = fsub fast <4 x float> %strided.vec39, %strided.vec41
+  %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
diff --git a/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-mul.ll b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-mul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/ComplexArithmetic/ARM/complex-arithmetic-rotations-mul.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -o - | FileCheck %s
+
+target triple = "arm-arm-none-eabi"
+attributes #0 = { "target-cpu"="cortex-m55" }
+
+define void @complex_rotation() #0 {
+; CHECK-LABEL: complex_rotation:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB0_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmul.f32 q2, q0, q0, #0
+; CHECK-NEXT: vcmla.f32 q2, q0, q0, #90
+; CHECK-NEXT: vcmul.f32 q0, q1, q1, #0
+; CHECK-NEXT: vcmla.f32 q0, q1, q1, #90
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB0_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec42, %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec41, %strided.vec39
+  %5 = fadd fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %2, <4 x float> %5, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
+
+define void @complex_rotation_I() #0 {
+; CHECK-LABEL: complex_rotation_I:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r1, #16
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB1_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s0, s4
+; CHECK-NEXT: vmov.f32 s1, s6
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vmul.f32 q2, q1, q1
+; CHECK-NEXT: vneg.f32 q3, q2
+; CHECK-NEXT: vmul.f32 q2, q0, q1
+; CHECK-NEXT: vneg.f32 q2, q2
+; CHECK-NEXT: vfma.f32 q3, q0, q0
+; CHECK-NEXT: vfms.f32 q2, q0, q1
+; CHECK-NEXT: vst20.32 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.32 {q2, q3}, [r0]
+; CHECK-NEXT: b .LBB1_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fneg fast <4 x float> %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec42, %3
+  %5 = fmul fast <4 x float> %strided.vec41, %strided.vec39
+  %6 = fsub fast <4 x float> %4, %5
+  %interleaved.vec = shufflevector <4 x float> %6, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
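+
+; NOTE: Rotation II below computes the negated product -(a*b); assuming the
+; usual vcmul/vcmla rotation semantics, this is expressible as vcmul #180
+; (re = -a.re*b.re, im = -a.re*b.im) accumulated with vcmla #270
+; (re += a.im*b.im, im -= a.im*b.re).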
+
+define void @complex_rotation_II() #0 {
+; CHECK-LABEL: complex_rotation_II:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #16
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB2_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vcmul.f32 q2, q0, q0, #180
+; CHECK-NEXT: vcmla.f32 q2, q0, q0, #270
+; CHECK-NEXT: vcmul.f32 q0, q1, q1, #180
+; CHECK-NEXT: vcmla.f32 q0, q1, q1, #270
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: b .LBB2_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec39.neg = fneg fast <4 x float> %strided.vec39
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec39.neg
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %4 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %5 = fsub fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
+
+define void @complex_rotation_III() #0 {
+; CHECK-LABEL: complex_rotation_III:
+; CHECK: @ %bb.0: @ %vector.ph
+; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r1, #16
+; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: .LBB3_1: @ %vector.body
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov.f32 s0, s4
+; CHECK-NEXT: vmov.f32 s1, s6
+; CHECK-NEXT: vmov.f32 s4, s5
+; CHECK-NEXT: vmov.f32 s5, s7
+; CHECK-NEXT: vmov.f32 s6, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s3, s10
+; CHECK-NEXT: vmul.f32 q2, q1, q1
+; CHECK-NEXT: vneg.f32 q3, q2
+; CHECK-NEXT: vmul.f32 q2, q0, q1
+; CHECK-NEXT: vfma.f32 q3, q0, q0
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vst20.32 {q2, q3}, [r0]
+; CHECK-NEXT: vst21.32 {q2, q3}, [r0]
+; CHECK-NEXT: b .LBB3_1
+vector.ph:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+  %store.ptr = bitcast { float, float }* null to <8 x float>*
+  %load.vec1 = load <8 x float>, <8 x float>* null, align 4
+  %load.vec2 = load <8 x float>, <8 x float>* null, align 4
+  %strided.vec = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec39 = shufflevector <8 x float> %load.vec1, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %strided.vec41 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %strided.vec42 = shufflevector <8 x float> %load.vec2, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <4 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <4 x float> %0, %1
+  %3 = fmul fast <4 x float> %strided.vec42, %strided.vec
+  %4 = fmul fast <4 x float> %strided.vec41, %strided.vec39
+  %5 = fadd fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %interleaved.vec, <8 x float>* %store.ptr, align 4
+  br label %vector.body
+}
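+
+; NOTE: Rotations I and III in this file swap the real/imaginary lanes of
+; the product before interleaving, which presumably does not correspond to
+; any single vcmul/vcmla rotation pair; hence the generic vmul/vneg/vfma
+; expansion in the checks above.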