diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h @@ -0,0 +1,53 @@ +//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements generation of target-specific intrinsics to support +// handling of complex number arithmetic and deinterleaving. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H +#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H + +#include "llvm/IR/PassManager.h" +#include "llvm/IR/PatternMatch.h" + +namespace llvm { + +class Function; +class TargetMachine; + +struct ComplexDeinterleavingPass + : public PassInfoMixin { +private: + TargetMachine *TM; + +public: + ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +enum class ComplexDeinterleavingOperation { + CAdd, + CMulPartial, + // The following 'operations' are used to represent internal states. Backends + // are not expected to try and support these in any capacity. + Shuffle +}; + +enum class ComplexDeinterleavingRotation { + Rotation_0 = 0, + Rotation_90 = 1, + Rotation_180 = 2, + Rotation_270 = 3, +}; + +} // namespace llvm + +#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -79,6 +79,10 @@ /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(); + /// This pass implements generation of target-specific intrinsics to support + /// handling of complex number arithmetic + FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM); + /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg /// load-linked/store-conditional loops. extern char &AtomicExpandID; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -27,6 +27,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/LowLevelType.h" @@ -3103,6 +3104,26 @@ return isOperationLegalOrCustom(Op, VT); } + /// Does this target support complex deinterleaving + virtual bool isComplexDeinterleavingSupported() const { return false; } + + /// Does this target support complex deinterleaving with the given operation + /// and type + virtual bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + return false; + } + + /// Create the IR node for the given complex deinterleaving operation. + /// If one cannot be created using all the given inputs, nullptr should be + /// returned. 
+ virtual Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const { + return nullptr; + } + //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -102,6 +102,7 @@ void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); void initializeCheckDebugMachineModulePass(PassRegistry &); void initializeCodeGenPreparePass(PassRegistry&); +void initializeComplexDeinterleavingLegacyPassPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); void initializeConstraintEliminationPass(PassRegistry &); diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -43,6 +43,7 @@ CodeGenPassBuilder.cpp CodeGenPrepare.cpp CommandFlags.cpp + ComplexDeinterleavingPass.cpp CriticalAntiDepBreaker.cpp DeadMachineInstructionElim.cpp DetectDeadLanes.cpp diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp @@ -0,0 +1,877 @@ +//===- ComplexDeinterleavingPass.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Identification: +// This step is responsible for finding the patterns that can be lowered to +// complex instructions, and building a graph to represent the complex +// structures. Starting from the "Converging Shuffle" (a shuffle that +// reinterleaves the complex components, with a mask of <0, 2, 1, 3>), the +// operands are evaluated and identified as "Composite Nodes" (collections of +// instructions that can potentially be lowered to a single complex +// instruction). This is performed by checking the real and imaginary components +// and tracking the data flow for each component while following the operand +// pairs. Validity of each node is expected to be done upon creation, and any +// validation errors should halt traversal and prevent further graph +// construction. +// +// Replacement: +// This step traverses the graph built up by identification, delegating to the +// target to validate and generate the correct intrinsics, and plumbs them +// together connecting each end of the new intrinsics graph to the existing +// use-def chain. This step is assumed to finish successfully, as all +// information is expected to be correct by this point. +// +// +// Internal data structure: +// ComplexDeinterleavingGraph: +// Keeps references to all the valid CompositeNodes formed as part of the +// transformation, and every Instruction contained within said nodes. It also +// holds onto a reference to the root Instruction, and the root node that should +// replace it. 
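+//
+// As a rough illustration (mirroring the MVE tests added below), a
+// deinterleaved complex add over <4 x float> is matched from IR of the form:
+//
+//   %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+//   %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+//   %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+//   %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+//   %0 = fsub fast <2 x float> %b.real, %a.imag
+//   %1 = fadd fast <2 x float> %b.imag, %a.real
+//   %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+//
+// Identification turns this into a single CAdd node (rotation 90) whose
+// operands are two Shuffle nodes wrapping the deinterleaving shuffles of %b
+// and %a, and replacement rewrites the converging shuffle into one target
+// intrinsic (a 90-degree VCADD when targeting MVE), leaving the original
+// instructions to be removed as trivially dead.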
+// +// ComplexDeinterleavingCompositeNode: +// A CompositeNode represents a single transformation point; each node should +// transform into a single complex instruction (ignoring vector splitting, which +// would generate more instructions per node). They are identified in a +// depth-first manner, traversing and identifying the operands of each +// instruction in the order they appear in the IR. +// Each node maintains a reference to its Real and Imaginary instructions, +// as well as any additional instructions that make up the identified operation +// (Internal instructions should only have uses within their containing node). +// A Node also contains the rotation and operation type that it represents. +// Operands contains pointers to other CompositeNodes, acting as the edges in +// the graph. ReplacementValue is the transformed Value* that has been emitted +// to the IR. +// +// Note: If the operation of a Node is Shuffle, only the Real, Imaginary, and +// ReplacementValue fields of that Node are relevant, where the ReplacementValue +// should be pre-populated. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ComplexDeinterleavingPass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/InitializePasses.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Local.h" +#include + +using namespace llvm; +using namespace PatternMatch; + +#define DEBUG_TYPE "complex-deinterleaving" + +STATISTIC(NumComplexTransformations, "Amount of complex patterns transformed"); + +static cl::opt ComplexDeinterleavingEnabled( + "enable-complex-deinterleaving", + cl::desc("Enable generation of complex instructions"), cl::init(true), + cl::Hidden); + +/// Checks the given mask, and determines whether said mask is interleaving. +/// +/// To be interleaving, a mask must alternate between `i` and `i + (Length / +/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a +/// 4x vector interleaving mask would be <0, 2, 1, 3>). +static bool isInterleavingMask(ArrayRef Mask); + +/// Checks the given mask, and determines whether said mask is deinterleaving. +/// +/// To be deinterleaving, a mask must increment in steps of 2, and either start +/// with 0 or 1. +/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or +/// <1, 3, 5, 7>). 
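+///
+/// Note that only the first half of the mask entries are verified against this
+/// pattern here; callers separately check the starting index (0 for the real
+/// half, 1 for the imaginary half) and that no index reaches into the second
+/// shuffle operand.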
+static bool isDeinterleavingMask(ArrayRef Mask); + +namespace { + +class ComplexDeinterleavingLegacyPass : public FunctionPass { +public: + static char ID; + + ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr) + : FunctionPass(ID), TM(TM) { + initializeComplexDeinterleavingLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Complex Deinterleaving Pass"; + } + + bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } + +private: + const TargetMachine *TM; +}; + +class ComplexDeinterleavingGraph; +struct ComplexDeinterleavingCompositeNode { + + ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op, + Instruction *R, Instruction *I) + : Operation(Op), Real(R), Imag(I) {} + +private: + friend class ComplexDeinterleavingGraph; + using NodePtr = std::shared_ptr; + using RawNodePtr = ComplexDeinterleavingCompositeNode *; + +public: + ComplexDeinterleavingOperation Operation; + Instruction *Real; + Instruction *Imag; + + // Instructions that should only exist within this node, there should be no + // users of these instructions outside the node. An example of these would be + // the multiply instructions of a partial multiply operation. + SmallVector InternalInstructions; + ComplexDeinterleavingRotation Rotation; + SmallVector Operands; + Value *ReplacementNode = nullptr; + + void addInstruction(Instruction *I) { InternalInstructions.push_back(I); } + void addOperand(NodePtr Node) { Operands.push_back(Node.get()); } + + bool hasAllInternalUses(SmallPtrSet &AllInstructions); + + void dump() { dump(dbgs()); } + void dump(raw_ostream &OS) { + auto PrintValue = [&](Value *V) { + if (V) { + OS << "\""; + V->print(OS, true); + OS << "\"\n"; + } else + OS << "nullptr\n"; + }; + auto PrintNodeRef = [&](RawNodePtr Ptr) { + if (Ptr) + OS << Ptr << "\n"; + else + OS << "nullptr\n"; + }; + + OS << "- CompositeNode: " << this << "\n"; + OS << " Real: "; + PrintValue(Real); + OS << " Imag: "; + PrintValue(Imag); + OS << " ReplacementNode: "; + PrintValue(ReplacementNode); + OS << " Operation: " << (int)Operation << "\n"; + OS << " Rotation: " << ((int)Rotation * 90) << "\n"; + OS << " Operands: \n"; + for (const auto &Op : Operands) { + OS << " - "; + PrintNodeRef(Op); + } + OS << " InternalInstructions:\n"; + for (const auto &I : InternalInstructions) { + OS << " - \""; + I->print(OS, true); + OS << "\"\n"; + } + } +}; + +class ComplexDeinterleavingGraph { +public: + using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr; + using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr; + explicit ComplexDeinterleavingGraph(const TargetLowering *tl) : TL(tl) {} + +private: + const TargetLowering *TL; + Instruction *RootValue; + NodePtr RootNode; + SmallVector CompositeNodes; + SmallPtrSet AllInstructions; + + NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation, + Instruction *R, Instruction *I) { + return std::make_shared(Operation, R, + I); + } + + NodePtr submitCompositeNode(NodePtr Node) { + CompositeNodes.push_back(Node); + AllInstructions.insert(Node->Real); + AllInstructions.insert(Node->Imag); + for (auto *I : Node->InternalInstructions) + AllInstructions.insert(I); + return Node; + } + + NodePtr getContainingComposite(Value *R, Value *I) { + for (const auto &CN : CompositeNodes) { + if (CN->Real == R && CN->Imag == I) + return CN; + } + return nullptr; + } + + /// Identifies a complex partial 
multiply pattern and its rotation, based on + /// the following patterns + /// + /// 0: r: cr + ar * br + /// i: ci + ar * bi + /// 90: r: cr - ai * bi + /// i: ci + ai * br + /// 180: r: cr - ar * br + /// i: ci - ar * bi + /// 270: r: cr + ai * bi + /// i: ci - ai * br + NodePtr identifyPartialMul(Instruction *Real, Instruction *Imag); + + /// Identify the other branch of a Partial Mul, taking the CommonOperandI that + /// is partially known from identifyPartialMul, filling in the other half of + /// the complex pair. + NodePtr identifyNodeWithImplicitAdd( + Instruction *I, Instruction *J, + std::pair &CommonOperandI); + + /// Identifies a complex add pattern and its rotation, based on the following + /// patterns. + /// + /// 90: r: ar - bi + /// i: ai + br + /// 270: r: ar + bi + /// i: ai - br + NodePtr identifyAdd(Instruction *Real, Instruction *Imag); + + NodePtr identifyNode(Instruction *I, Instruction *J); + + Value *replaceNode(RawNodePtr Node); + +public: + void dump() { dump(dbgs()); } + void dump(raw_ostream &OS) { + for (const auto &Node : CompositeNodes) + Node->dump(OS); + } + + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + bool identifyNodes(Instruction *RootI); + + /// Perform the actual replacement of the underlying instruction graph. + /// Returns false if the deinterleaving operation should be cancelled for the + /// current graph. + void replaceNodes(); +}; + +class ComplexDeinterleaving { +public: + ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli) + : TL(tl), TLI(tli) {} + bool runOnFunction(Function &F); + +private: + bool evaluateBasicBlock(BasicBlock *B); + + const TargetLowering *TL = nullptr; + const TargetLibraryInfo *TLI = nullptr; +}; + +} // namespace + +char ComplexDeinterleavingLegacyPass::ID = 0; + +INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, + "Complex Deinterleaving", false, false) +INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE, + "Complex Deinterleaving", false, false) + +PreservedAnalyses ComplexDeinterleavingPass::run(Function &F, + FunctionAnalysisManager &AM) { + const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + auto &TLI = AM.getResult(F); + if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) { + return new ComplexDeinterleavingLegacyPass(TM); +} + +bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) { + const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + auto TLI = getAnalysis().getTLI(F); + return ComplexDeinterleaving(TL, &TLI).runOnFunction(F); +} + +bool ComplexDeinterleaving::runOnFunction(Function &F) { + if (!ComplexDeinterleavingEnabled) { + LLVM_DEBUG( + dbgs() << "Complex deinterleaving has been explicitly disabled.\n"); + return false; + } + + if (!TL->isComplexDeinterleavingSupported()) { + LLVM_DEBUG( + dbgs() << "Complex deinterleaving has been disabled, target does " + "not support lowering of complex number operations.\n"); + return false; + } + + bool Changed = false; + for (auto &B : F) + Changed |= evaluateBasicBlock(&B); + + return Changed; +} + +static bool isInterleavingMask(ArrayRef Mask) { + // If the size is not even, it's not an interleaving mask + if ((Mask.size() & 1)) + return false; + + int HalfNumElements = Mask.size() / 2; + for (int Idx = 0; Idx < HalfNumElements; 
++Idx) { + int MaskIdx = Idx * 2; + if (Mask[MaskIdx] != Idx || Mask[MaskIdx + 1] != (Idx + HalfNumElements)) + return false; + } + + return true; +} + +static bool isDeinterleavingMask(ArrayRef Mask) { + int Offset = Mask[0]; + int HalfNumElements = Mask.size() / 2; + + for (int Idx = 1; Idx < HalfNumElements; ++Idx) { + if (Mask[Idx] != (Idx * 2) + Offset) + return false; + } + + return true; +} + +bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) { + bool Changed = false; + + SmallVector DeadInstrRoots; + + for (auto &I : *B) { + auto *SVI = dyn_cast(&I); + if (!SVI) + continue; + + // Look for a shufflevector that takes separate vectors of the real and + // imaginary components and recombines them into a single vector. + if (!isInterleavingMask(SVI->getShuffleMask())) + continue; + + ComplexDeinterleavingGraph Graph(TL); + if (!Graph.identifyNodes(SVI)) + continue; + + Graph.replaceNodes(); + DeadInstrRoots.push_back(SVI); + Changed = true; + } + + for (const auto &I : DeadInstrRoots) { + if (!I || I->getParent() == nullptr) + continue; + llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI); + } + + return Changed; +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd( + Instruction *Real, Instruction *Imag, + std::pair &PartialMatch) { + LLVM_DEBUG(dbgs() << "identifyNodeWithImplicitAdd " << *Real << " / " << *Imag + << "\n"); + + if (!Real->hasOneUse() || !Imag->hasOneUse()) { + LLVM_DEBUG(dbgs() << " - Mul operand has multiple uses.\n"); + return nullptr; + } + + if (Real->getOpcode() != Instruction::FMul || + Imag->getOpcode() != Instruction::FMul) { + LLVM_DEBUG(dbgs() << " - Real or imaginary instruction is not fmul\n"); + return nullptr; + } + + Instruction *R0 = dyn_cast(Real->getOperand(0)); + Instruction *R1 = dyn_cast(Real->getOperand(1)); + Instruction *I0 = dyn_cast(Imag->getOperand(0)); + Instruction *I1 = dyn_cast(Imag->getOperand(1)); + if (!R0 || !R1 || !I0 || !I1) { + LLVM_DEBUG(dbgs() << " - Mul operand not Instruction\n"); + return nullptr; + } + + // A +/+ has a rotation of 0. If any of the operands are fneg, we flip the + // rotations and use the operand. + unsigned Negs = 0; + SmallVector FNegs; + if (R0->getOpcode() == Instruction::FNeg || + R1->getOpcode() == Instruction::FNeg) { + Negs |= 1; + if (R0->getOpcode() == Instruction::FNeg) { + FNegs.push_back(R0); + R0 = dyn_cast(R0->getOperand(0)); + } else { + FNegs.push_back(R1); + R1 = dyn_cast(R1->getOperand(0)); + } + if (!R0 || !R1) + return nullptr; + } + if (I0->getOpcode() == Instruction::FNeg || + I1->getOpcode() == Instruction::FNeg) { + Negs |= 2; + Negs ^= 1; + if (I0->getOpcode() == Instruction::FNeg) { + FNegs.push_back(I0); + I0 = dyn_cast(I0->getOperand(0)); + } else { + FNegs.push_back(I1); + I1 = dyn_cast(I1->getOperand(0)); + } + if (!I0 || !I1) + return nullptr; + } + + ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation)Negs; + + Instruction *CommonOperand; + Instruction *UncommonRealOp; + Instruction *UncommonImagOp; + + if (R0 == I0 || R0 == I1) { + CommonOperand = R0; + UncommonRealOp = R1; + } else if (R1 == I0 || R1 == I1) { + CommonOperand = R1; + UncommonRealOp = R0; + } else { + LLVM_DEBUG(dbgs() << " - No equal operand\n"); + return nullptr; + } + + UncommonImagOp = (CommonOperand == I0) ? 
I1 : I0; + if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || + Rotation == ComplexDeinterleavingRotation::Rotation_270) + std::swap(UncommonRealOp, UncommonImagOp); + + // Between identifyPartialMul and here we need to have found a complete valid + // pair from the CommonOperand of each part. + if (Rotation == ComplexDeinterleavingRotation::Rotation_0 || + Rotation == ComplexDeinterleavingRotation::Rotation_180) + PartialMatch.first = CommonOperand; + else + PartialMatch.second = CommonOperand; + + if (!PartialMatch.first || !PartialMatch.second) { + LLVM_DEBUG(dbgs() << " - Incomplete partial match\n"); + return nullptr; + } + + NodePtr CommonNode = identifyNode(PartialMatch.first, PartialMatch.second); + if (!CommonNode) { + LLVM_DEBUG(dbgs() << " - No CommonNode identified\n"); + return nullptr; + } + + NodePtr UncommonNode = identifyNode(UncommonRealOp, UncommonImagOp); + if (!UncommonNode) { + LLVM_DEBUG(dbgs() << " - No UncommonNode identified\n"); + return nullptr; + } + + NodePtr Node = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, Real, Imag); + Node->Rotation = Rotation; + Node->addOperand(CommonNode); + Node->addOperand(UncommonNode); + Node->InternalInstructions.append(FNegs); + return submitCompositeNode(Node); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real, + Instruction *Imag) { + LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag + << "\n"); + // Determine rotation + ComplexDeinterleavingRotation Rotation; + if (Real->getOpcode() == Instruction::FAdd && + Imag->getOpcode() == Instruction::FAdd) + Rotation = ComplexDeinterleavingRotation::Rotation_0; + else if (Real->getOpcode() == Instruction::FSub && + Imag->getOpcode() == Instruction::FAdd) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (Real->getOpcode() == Instruction::FSub && + Imag->getOpcode() == Instruction::FSub) + Rotation = ComplexDeinterleavingRotation::Rotation_180; + else if (Real->getOpcode() == Instruction::FAdd && + Imag->getOpcode() == Instruction::FSub) + Rotation = ComplexDeinterleavingRotation::Rotation_270; + else { + LLVM_DEBUG(dbgs() << " - Unhandled rotation.\n"); + return nullptr; + } + + if (!Real->getFastMathFlags().allowContract() || + !Imag->getFastMathFlags().allowContract()) { + LLVM_DEBUG(dbgs() << " - Contract is missing from the FastMath flags.\n"); + return nullptr; + } + + Value *CR = Real->getOperand(0); + Instruction *RealMulI = dyn_cast(Real->getOperand(1)); + if (!RealMulI) + return nullptr; + Value *CI = Imag->getOperand(0); + Instruction *ImagMulI = dyn_cast(Imag->getOperand(1)); + if (!ImagMulI) + return nullptr; + + if (!RealMulI->hasOneUse() || !ImagMulI->hasOneUse()) { + LLVM_DEBUG(dbgs() << " - Mul instruction has multiple uses\n"); + return nullptr; + } + + Instruction *R0 = dyn_cast(RealMulI->getOperand(0)); + Instruction *R1 = dyn_cast(RealMulI->getOperand(1)); + Instruction *I0 = dyn_cast(ImagMulI->getOperand(0)); + Instruction *I1 = dyn_cast(ImagMulI->getOperand(1)); + if (!R0 || !R1 || !I0 || !I1) { + LLVM_DEBUG(dbgs() << " - Mul operand not Instruction\n"); + return nullptr; + } + + Instruction *CommonOperand; + Instruction *UncommonRealOp; + Instruction *UncommonImagOp; + + if (R0 == I0 || R0 == I1) { + CommonOperand = R0; + UncommonRealOp = R1; + } else if (R1 == I0 || R1 == I1) { + CommonOperand = R1; + UncommonRealOp = R0; + } else { + LLVM_DEBUG(dbgs() << " - No equal operand\n"); + return nullptr; + } + + UncommonImagOp = 
(CommonOperand == I0) ? I1 : I0; + if (Rotation == ComplexDeinterleavingRotation::Rotation_90 || + Rotation == ComplexDeinterleavingRotation::Rotation_270) + std::swap(UncommonRealOp, UncommonImagOp); + + std::pair PartialMatch( + (Rotation == ComplexDeinterleavingRotation::Rotation_0 || + Rotation == ComplexDeinterleavingRotation::Rotation_180) + ? CommonOperand + : nullptr, + (Rotation == ComplexDeinterleavingRotation::Rotation_90 || + Rotation == ComplexDeinterleavingRotation::Rotation_270) + ? CommonOperand + : nullptr); + NodePtr CNode = identifyNodeWithImplicitAdd( + cast(CR), cast(CI), PartialMatch); + if (!CNode) { + LLVM_DEBUG(dbgs() << " - No cnode identified\n"); + return nullptr; + } + + NodePtr UncommonRes = identifyNode(UncommonRealOp, UncommonImagOp); + if (!UncommonRes) { + LLVM_DEBUG(dbgs() << " - No UncommonRes identified\n"); + return nullptr; + } + + assert(PartialMatch.first && PartialMatch.second); + NodePtr CommonRes = identifyNode(PartialMatch.first, PartialMatch.second); + if (!CommonRes) { + LLVM_DEBUG(dbgs() << " - No CommonRes identified\n"); + return nullptr; + } + + NodePtr Node = prepareCompositeNode( + ComplexDeinterleavingOperation::CMulPartial, Real, Imag); + Node->addInstruction(RealMulI); + Node->addInstruction(ImagMulI); + Node->Rotation = Rotation; + Node->addOperand(CommonRes); + Node->addOperand(UncommonRes); + Node->addOperand(CNode); + return submitCompositeNode(Node); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyAdd(Instruction *Real, Instruction *Imag) { + LLVM_DEBUG(dbgs() << "identifyAdd " << *Real << " / " << *Imag << "\n"); + + // Determine rotation + ComplexDeinterleavingRotation Rotation; + if (Real->getOpcode() == Instruction::FSub && + Imag->getOpcode() == Instruction::FAdd) + Rotation = ComplexDeinterleavingRotation::Rotation_90; + else if (Real->getOpcode() == Instruction::FAdd && + Imag->getOpcode() == Instruction::FSub) + Rotation = ComplexDeinterleavingRotation::Rotation_270; + else { + LLVM_DEBUG(dbgs() << " - Unhandled case, rotation is not assigned.\n"); + return nullptr; + } + + auto *AR = cast(Real->getOperand(0)); + auto *BI = cast(Real->getOperand(1)); + auto *AI = cast(Imag->getOperand(0)); + auto *BR = cast(Imag->getOperand(1)); + + NodePtr ResA = identifyNode(AR, AI); + if (!ResA) { + LLVM_DEBUG(dbgs() << " - AR/AI is not identified as a composite node.\n"); + return nullptr; + } + NodePtr ResB = identifyNode(BR, BI); + if (!ResB) { + LLVM_DEBUG(dbgs() << " - BR/BI is not identified as a composite node.\n"); + return nullptr; + } + + NodePtr Node = + prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, Real, Imag); + Node->Rotation = Rotation; + Node->addOperand(ResA); + Node->addOperand(ResB); + return submitCompositeNode(Node); +} + +static bool isInstructionPairAdd(Instruction *A, Instruction *B) { + unsigned OpcA = A->getOpcode(); + unsigned OpcB = B->getOpcode(); + return (OpcA == Instruction::FSub && OpcB == Instruction::FAdd) || + (OpcA == Instruction::FAdd && OpcB == Instruction::FSub); +} + +static bool isInstructionPairMul(Instruction *A, Instruction *B) { + auto Pattern = + m_BinOp(m_FMul(m_Value(), m_Value()), m_FMul(m_Value(), m_Value())); + + return match(A, Pattern) && match(B, Pattern); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) { + LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n"); + if (NodePtr CN = getContainingComposite(Real, Imag)) { + LLVM_DEBUG(dbgs() << " - 
Folding to existing node\n"); + return CN; + } + + auto *RealShuffle = dyn_cast(Real); + auto *ImagShuffle = dyn_cast(Imag); + if (RealShuffle && ImagShuffle) { + Value *RealOp1 = RealShuffle->getOperand(1); + if (!isa(RealOp1) && !isa(RealOp1)) { + LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n"); + return nullptr; + } + Value *ImagOp1 = ImagShuffle->getOperand(1); + if (!isa(ImagOp1) && !isa(ImagOp1)) { + LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n"); + return nullptr; + } + + Value *RealOp0 = RealShuffle->getOperand(0); + Value *ImagOp0 = ImagShuffle->getOperand(0); + + if (RealOp0 != ImagOp0) { + LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n"); + return nullptr; + } + + ArrayRef RealMask = RealShuffle->getShuffleMask(); + ArrayRef ImagMask = ImagShuffle->getShuffleMask(); + if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) { + LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n"); + return nullptr; + } + + if (RealMask[0] != 0 || ImagMask[0] != 1) { + LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n"); + return nullptr; + } + + // Type checking, the shuffle type should be a vector type of the same + // scalar type, but half the size + auto CheckType = [&](ShuffleVectorInst *Shuffle) { + Value *Op = Shuffle->getOperand(0); + auto *ShuffleTy = cast(Shuffle->getType()); + auto *OpTy = cast(Op->getType()); + + if (OpTy->getScalarType() != ShuffleTy->getScalarType()) + return false; + if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements()) + return false; + + return true; + }; + + auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool { + if (!CheckType(Shuffle)) + return false; + + ArrayRef Mask = Shuffle->getShuffleMask(); + int Last = *Mask.rbegin(); + + Value *Op = Shuffle->getOperand(0); + auto *OpTy = cast(Op->getType()); + int NumElements = OpTy->getNumElements(); + + // Ensure that the deinterleaving shuffle only pulls from the first + // shuffle operand. 
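+      // For example, when a <8 x half> operand is deinterleaved into a
+      // <4 x half> result, every mask index must be below 8; an index of 8 or
+      // above would read from the second shuffle operand (the undef/zero
+      // vector) rather than deinterleaving the data vector, so such shuffles
+      // are rejected.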
+ return Last < NumElements; + }; + + if (RealShuffle->getType() != ImagShuffle->getType()) { + LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n"); + return nullptr; + } + if (!CheckDeinterleavingShuffle(RealShuffle)) { + LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n"); + return nullptr; + } + if (!CheckDeinterleavingShuffle(ImagShuffle)) { + LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n"); + return nullptr; + } + + NodePtr PlaceholderNode = + prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Shuffle, + RealShuffle, ImagShuffle); + PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0); + return submitCompositeNode(PlaceholderNode); + } + if (RealShuffle || ImagShuffle) + return nullptr; + + auto *VTy = cast(Real->getType()); + auto *NewVTy = + FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2); + + if (TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CMulPartial, NewVTy) && + isInstructionPairMul(Real, Imag)) { + return identifyPartialMul(Real, Imag); + } + + if (TL->isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation::CAdd, NewVTy) && + isInstructionPairAdd(Real, Imag)) { + return identifyAdd(Real, Imag); + } + + return nullptr; +} + +bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { + Instruction *Real; + Instruction *Imag; + if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag)))) + return false; + + RootValue = RootI; + AllInstructions.insert(RootI); + RootNode = identifyNode(Real, Imag); + + LLVM_DEBUG({ + Function *F = RootI->getFunction(); + BasicBlock *B = RootI->getParent(); + dbgs() << "Complex deinterleaving graph for " << F->getName() + << "::" << B->getName() << ".\n"; + dump(dbgs()); + dbgs() << "\n"; + }); + + // Check all instructions have internal uses + for (const auto &Node : CompositeNodes) { + if (!Node->hasAllInternalUses(AllInstructions)) { + LLVM_DEBUG(dbgs() << " - Invalid internal uses\n"); + return false; + } + } + return RootNode != nullptr; +} + +Value *ComplexDeinterleavingGraph::replaceNode( + ComplexDeinterleavingGraph::RawNodePtr Node) { + if (Node->ReplacementNode) + return Node->ReplacementNode; + + Value *Input0 = replaceNode(Node->Operands[0]); + Value *Input1 = replaceNode(Node->Operands[1]); + Value *Accumulator = + Node->Operands.size() > 2 ? 
replaceNode(Node->Operands[2]) : nullptr; + + assert(Input0->getType() == Input1->getType() && + "Node inputs need to be of the same type"); + + Node->ReplacementNode = TL->createComplexDeinterleavingIR( + Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator); + + assert(Node->ReplacementNode && "Target failed to create Intrinsic call."); + NumComplexTransformations += 1; + return Node->ReplacementNode; +} + +void ComplexDeinterleavingGraph::replaceNodes() { + Value *R = replaceNode(RootNode.get()); + assert(R && "Unable to find replacement for RootValue"); + RootValue->replaceAllUsesWith(R); +} + +bool ComplexDeinterleavingCompositeNode::hasAllInternalUses( + SmallPtrSet &AllInstructions) { + if (Operation == ComplexDeinterleavingOperation::Shuffle) + return true; + + for (auto *User : Real->users()) { + if (!AllInstructions.contains(cast(User))) + return false; + } + for (auto *User : Imag->users()) { + if (!AllInstructions.contains(cast(User))) + return false; + } + for (auto *I : InternalInstructions) { + for (auto *User : I->users()) { + if (!AllInstructions.contains(cast(User))) + return false; + } + } + return true; +} \ No newline at end of file diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -736,6 +736,15 @@ bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; + bool isComplexDeinterleavingSupported() const override; + bool isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const override; + + Value *createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, + Value *Accumulator = nullptr) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21833,3 +21833,97 @@ MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } + +bool ARMTargetLowering::isComplexDeinterleavingSupported() const { + return Subtarget->hasMVEFloatOps(); +} + +bool ARMTargetLowering::isComplexDeinterleavingOperationSupported( + ComplexDeinterleavingOperation Operation, Type *Ty) const { + auto *VTy = dyn_cast(Ty); + if (!VTy) + return false; + + auto *ScalarTy = VTy->getScalarType(); + unsigned NumElements = VTy->getNumElements(); + + unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements; + if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth)) + return false; + + // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32 + return ScalarTy->isHalfTy() || ScalarTy->isFloatTy(); +} + +Value *ARMTargetLowering::createComplexDeinterleavingIR( + Instruction *I, ComplexDeinterleavingOperation OperationType, + ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, + Value *Accumulator) const { + + FixedVectorType *Ty = cast(InputA->getType()); + + IRBuilder<> B(I); + + unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements(); + + assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits"); + + if (TyWidth > 128) { + int Stride = Ty->getNumElements() / 2; + auto SplitSeq = llvm::seq(0, Ty->getNumElements()); + auto SplitSeqVec = llvm::to_vector(SplitSeq); + ArrayRef 
LowerSplitMask(&SplitSeqVec[0], Stride); + ArrayRef UpperSplitMask(&SplitSeqVec[Stride], Stride); + + auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask); + auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask); + auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask); + auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask); + Value *LowerSplitAcc = nullptr; + Value *UpperSplitAcc = nullptr; + + if (Accumulator) { + LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask); + UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask); + } + + auto *LowerSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc); + auto *UpperSplitInt = createComplexDeinterleavingIR( + I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc); + + ArrayRef JoinMask(&SplitSeqVec[0], Ty->getNumElements()); + return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask); + } + + auto *IntTy = Type::getInt32Ty(B.getContext()); + + ConstantInt *ConstRotation = nullptr; + if (OperationType == ComplexDeinterleavingOperation::CMulPartial) { + ConstRotation = ConstantInt::get(IntTy, (int)Rotation); + + if (Accumulator) + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty, + {ConstRotation, Accumulator, InputB, InputA}); + return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty, + {ConstRotation, InputB, InputA}); + } + + if (OperationType == ComplexDeinterleavingOperation::CAdd) { + // 1 means the value is not halved. + auto *ConstHalving = ConstantInt::get(IntTy, 1); + + if (Rotation == ComplexDeinterleavingRotation::Rotation_90) + ConstRotation = ConstantInt::get(IntTy, 0); + else if (Rotation == ComplexDeinterleavingRotation::Rotation_270) + ConstRotation = ConstantInt::get(IntTy, 1); + + if (!ConstRotation) + return nullptr; // Invalid rotation for arm_mve_vcaddq + + return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty, + {ConstHalving, ConstRotation, InputA, InputB}); + } + + return nullptr; +} diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -426,9 +426,13 @@ TargetPassConfig::addIRPasses(); // Run the parallel DSP pass. - if (getOptLevel() == CodeGenOpt::Aggressive) + if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createARMParallelDSPPass()); + // Match complex arithmetic patterns + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(createComplexDeinterleavingPass(TM)); + // Match interleaved memory accesses to ldN/stN intrinsics. 
if (TM->getOptLevel() != CodeGenOpt::None) addPass(createInterleavedAccessPass()); diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -47,6 +47,7 @@ ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics +; CHECK-NEXT: Complex Deinterleaving Pass ; CHECK-NEXT: Interleaved Access Pass ; CHECK-NEXT: Type Promotion ; CHECK-NEXT: CodeGen Prepare diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + +; Expected to not transform +define arm_aapcs_vfpcc <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_add_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vadd.f16 s2, s2, s0 +; CHECK-NEXT: vmovx.f16 s0, s0 +; CHECK-NEXT: vsub.f16 s0, s4, s0 +; CHECK-NEXT: vins.f16 s0, s2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x half> %b.real, %a.imag + %1 = fadd fast <1 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> + ret <2 x half> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_add_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vadd.f16 q3, q3, q0 +; CHECK-NEXT: vsub.f16 q0, q1, q2 +; CHECK-NEXT: vmovx.f16 s1, s0 +; CHECK-NEXT: vmovx.f16 s2, s12 +; CHECK-NEXT: vins.f16 s0, s12 +; CHECK-NEXT: vins.f16 s1, s2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x half> %b.real, %a.imag + %1 = fadd fast <2 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> + ret <4 x half> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_add_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f16 q0, q1, q0, #90 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> 
zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x half> %b.real, %a.imag + %1 = fadd fast <4 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> + ret <8 x half> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_add_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f16 q0, q2, q0, #90 +; CHECK-NEXT: vcadd.f16 q1, q3, q1, #90 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x half> %b.real, %a.imag + %1 = fadd fast <8 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> + ret <16 x half> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_add_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: add r0, sp, #16 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vcadd.f16 q0, q4, q0, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vcadd.f16 q1, q4, q1, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vcadd.f16 q2, q4, q2, #90 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vcadd.f16 q3, q4, q3, #90 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> + %0 = fsub fast <16 x half> %b.real, %a.imag + %1 = fadd fast <16 x half> %b.imag, %a.real + %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll @@ -0,0 +1,168 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +; Expected to not transform +define arm_aapcs_vfpcc <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: complex_mul_v2f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovx.f16 s2, s4 +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmul.f16 s6, s2, s0 +; CHECK-NEXT: vfma.f16 s6, s4, s8 +; CHECK-NEXT: vmul.f16 s8, s8, s2 +; CHECK-NEXT: vfnms.f16 s8, s4, s0 +; CHECK-NEXT: vins.f16 s8, s6 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> + %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> + %b.imag = shufflevector <2 x half> 
%b, <2 x half> poison, <1 x i32> + %0 = fmul fast <1 x half> %b.imag, %a.real + %1 = fmul fast <1 x half> %b.real, %a.imag + %2 = fadd fast <1 x half> %1, %0 + %3 = fmul fast <1 x half> %b.real, %a.real + %4 = fmul fast <1 x half> %a.imag, %b.imag + %5 = fsub fast <1 x half> %3, %4 + %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> + ret <2 x half> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: complex_mul_v4f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmovx.f16 s8, s0 +; CHECK-NEXT: vmovx.f16 s2, s1 +; CHECK-NEXT: vins.f16 s8, s2 +; CHECK-NEXT: vmovx.f16 s12, s4 +; CHECK-NEXT: vmovx.f16 s2, s5 +; CHECK-NEXT: vins.f16 s0, s1 +; CHECK-NEXT: vins.f16 s12, s2 +; CHECK-NEXT: vins.f16 s4, s5 +; CHECK-NEXT: vmul.f16 q4, q3, q0 +; CHECK-NEXT: vfma.f16 q4, q1, q2 +; CHECK-NEXT: vmul.f16 q2, q2, q3 +; CHECK-NEXT: vneg.f16 q2, q2 +; CHECK-NEXT: vfma.f16 q2, q1, q0 +; CHECK-NEXT: vmovx.f16 s0, s16 +; CHECK-NEXT: vmovx.f16 s9, s8 +; CHECK-NEXT: vins.f16 s8, s16 +; CHECK-NEXT: vins.f16 s9, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> + %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> + %0 = fmul fast <2 x half> %b.imag, %a.real + %1 = fmul fast <2 x half> %b.real, %a.imag + %2 = fadd fast <2 x half> %1, %0 + %3 = fmul fast <2 x half> %b.real, %a.real + %4 = fmul fast <2 x half> %a.imag, %b.imag + %5 = fsub fast <2 x half> %3, %4 + %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> + ret <4 x half> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: complex_mul_v8f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f16 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f16 q2, q0, q1, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> + %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> + %0 = fmul fast <4 x half> %b.imag, %a.real + %1 = fmul fast <4 x half> %b.real, %a.imag + %2 = fadd fast <4 x half> %1, %0 + %3 = fmul fast <4 x half> %b.real, %a.real + %4 = fmul fast <4 x half> %a.imag, %b.imag + %5 = fsub fast <4 x half> %3, %4 + %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> + ret <8 x half> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: complex_mul_v16f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f16 q4, q0, q2, #0 +; CHECK-NEXT: vcmla.f16 q4, q0, q2, #90 +; CHECK-NEXT: vcmul.f16 q2, q1, q3, #0 +; CHECK-NEXT: vcmla.f16 q2, q1, q3, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> + %b.real = 
shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + %0 = fmul fast <8 x half> %b.imag, %a.real + %1 = fmul fast <8 x half> %b.real, %a.imag + %2 = fadd fast <8 x half> %1, %0 + %3 = fmul fast <8 x half> %b.real, %a.real + %4 = fmul fast <8 x half> %a.imag, %b.imag + %5 = fsub fast <8 x half> %3, %4 + %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> + ret <16 x half> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: complex_mul_v32f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vcmul.f16 q0, q0, q5, #0 +; CHECK-NEXT: vcmla.f16 q0, q4, q5, #90 +; CHECK-NEXT: vldrw.u32 q5, [r0] +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vcmul.f16 q4, q1, q5, #0 +; CHECK-NEXT: vcmla.f16 q4, q1, q5, #90 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add r0, sp, #96 +; CHECK-NEXT: vcmul.f16 q5, q2, q1, #0 +; CHECK-NEXT: vcmla.f16 q5, q2, q1, #90 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vcmul.f16 q6, q3, q1, #0 +; CHECK-NEXT: vcmla.f16 q6, q3, q1, #90 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmov q3, q6 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> + %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + %0 = fmul fast <16 x half> %b.imag, %a.real + %1 = fmul fast <16 x half> %b.real, %a.imag + %2 = fadd fast <16 x half> %1, %0 + %3 = fmul fast <16 x half> %b.real, %a.real + %4 = fmul fast <16 x half> %a.imag, %b.imag + %5 = fsub fast <16 x half> %3, %4 + %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> + ret <32 x half> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + +; Expected to not transform +define arm_aapcs_vfpcc <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_add_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f32 s5, s5, s0 +; CHECK-NEXT: vsub.f32 s4, s4, s1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x float> %b.real, %a.imag + %1 = fadd fast <1 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> + ret <2 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <4 
x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_add_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x float> %b.real, %a.imag + %1 = fadd fast <2 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_add_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcadd.f32 q4, q2, q0, #90 +; CHECK-NEXT: vcadd.f32 q2, q3, q1, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x float> %b.real, %a.imag + %1 = fadd fast <4 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> + ret <8 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_add_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: add r2, sp, #80 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: add r1, sp, #96 +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vcadd.f32 q4, q5, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vcadd.f32 q5, q0, q1, #90 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vcadd.f32 q6, q0, q2, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vcadd.f32 q7, q0, q3, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q3, q7 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> + %0 = fsub fast <8 x float> %b.real, %a.imag + %1 = fadd fast <8 x float> %b.imag, %a.real + %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s 
--mattr=+mve.fp -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +; Expected to not transform +define arm_aapcs_vfpcc <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: complex_mul_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f32 s9, s5, s0 +; CHECK-NEXT: vmul.f32 s8, s1, s5 +; CHECK-NEXT: vfma.f32 s9, s4, s1 +; CHECK-NEXT: vfnms.f32 s8, s4, s0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> + %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> + %0 = fmul fast <1 x float> %b.imag, %a.real + %1 = fmul fast <1 x float> %b.real, %a.imag + %2 = fadd fast <1 x float> %1, %0 + %3 = fmul fast <1 x float> %b.real, %a.real + %4 = fmul fast <1 x float> %a.imag, %b.imag + %5 = fsub fast <1 x float> %3, %4 + %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> + ret <2 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: complex_mul_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %b.imag, %a.real + %1 = fmul fast <2 x float> %b.real, %a.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %b.real, %a.real + %4 = fmul fast <2 x float> %a.imag, %b.imag + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) { +; CHECK-LABEL: complex_mul_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f32 q4, q0, q2, #0 +; CHECK-NEXT: vcmla.f32 q4, q0, q2, #90 +; CHECK-NEXT: vcmul.f32 q2, q1, q3, #0 +; CHECK-NEXT: vcmla.f32 q2, q1, q3, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> + %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + %0 = fmul fast <4 x float> %b.imag, %a.real + %1 = fmul fast <4 x float> %b.real, %a.imag + %2 = fadd fast <4 x float> %1, %0 + %3 = fmul fast <4 x float> %b.real, %a.real + %4 = fmul fast <4 x float> %a.imag, %b.imag + %5 = fsub fast <4 x float> %3, %4 + %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> + ret <8 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) { +; CHECK-LABEL: complex_mul_v16f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, 
d11, d12, d13, d14, d15} +; CHECK-NEXT: add r3, sp, #64 +; CHECK-NEXT: add r2, sp, #80 +; CHECK-NEXT: vldrw.u32 q5, [r3] +; CHECK-NEXT: add r1, sp, #96 +; CHECK-NEXT: add r0, sp, #112 +; CHECK-NEXT: vcmul.f32 q4, q0, q5, #0 +; CHECK-NEXT: vcmla.f32 q4, q0, q5, #90 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vcmul.f32 q5, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q5, q1, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vmov q1, q5 +; CHECK-NEXT: vcmul.f32 q6, q2, q0, #0 +; CHECK-NEXT: vcmla.f32 q6, q2, q0, #90 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vcmul.f32 q7, q3, q0, #0 +; CHECK-NEXT: vcmla.f32 q7, q3, q0, #90 +; CHECK-NEXT: vmov q0, q4 +; CHECK-NEXT: vmov q3, q7 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> + %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + %0 = fmul fast <8 x float> %b.imag, %a.real + %1 = fmul fast <8 x float> %b.real, %a.imag + %2 = fadd fast <8 x float> %1, %0 + %3 = fmul fast <8 x float> %b.real, %a.real + %4 = fmul fast <8 x float> %a.imag, %b.imag + %5 = fsub fast <8 x float> %3, %4 + %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> + ret <16 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + + +; Expected to not transform +define arm_aapcs_vfpcc <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_add_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d3, d3, d0 +; CHECK-NEXT: vsub.f64 d2, d2, d1 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> + %0 = fsub fast <1 x double> %b.real, %a.imag + %1 = fadd fast <1 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> + ret <2 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_add_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vadd.f64 d5, d5, d0 +; CHECK-NEXT: vsub.f64 d4, d4, d1 +; CHECK-NEXT: vadd.f64 d7, d7, d2 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: vsub.f64 d6, d6, d3 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> + %0 = fsub fast <2 x 
double> %b.real, %a.imag + %1 = fadd fast <2 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_add_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: add r0, sp, #32 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: add r0, sp, #48 +; CHECK-NEXT: vadd.f64 d1, d1, d2 +; CHECK-NEXT: vsub.f64 d0, d0, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add r0, sp, #64 +; CHECK-NEXT: vadd.f64 d3, d3, d8 +; CHECK-NEXT: vsub.f64 d2, d2, d9 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #80 +; CHECK-NEXT: vadd.f64 d9, d9, d4 +; CHECK-NEXT: vsub.f64 d8, d8, d5 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vadd.f64 d11, d5, d6 +; CHECK-NEXT: vsub.f64 d10, d4, d7 +; CHECK-NEXT: vmov q2, q4 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> + %0 = fsub fast <4 x double> %b.real, %a.imag + %1 = fadd fast <4 x double> %b.imag, %a.real + %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +; Expected to not transform +define arm_aapcs_vfpcc <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) { +; CHECK-LABEL: complex_mul_v2f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vfma.f64 d5, d2, d1 +; CHECK-NEXT: vfnms.f64 d4, d2, d0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> + %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> + %0 = fmul fast <1 x double> %b.imag, %a.real + %1 = fmul fast <1 x double> %b.real, %a.imag + %2 = fadd fast <1 x double> %1, %0 + %3 = fmul fast <1 x double> %b.real, %a.real + %4 = fmul fast <1 x double> %a.imag, %b.imag + %5 = fsub fast <1 x double> %3, %4 + %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> + ret <2 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) { +; CHECK-LABEL: complex_mul_v4f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmul.f64 d9, d7, d2 +; CHECK-NEXT: vmov 
q5, q0 +; CHECK-NEXT: vmul.f64 d8, d3, d7 +; CHECK-NEXT: vfma.f64 d9, d6, d3 +; CHECK-NEXT: vfnms.f64 d8, d6, d2 +; CHECK-NEXT: vmul.f64 d1, d5, d10 +; CHECK-NEXT: vmov q1, q4 +; CHECK-NEXT: vmul.f64 d0, d11, d5 +; CHECK-NEXT: vfma.f64 d1, d4, d11 +; CHECK-NEXT: vfnms.f64 d0, d4, d10 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> + %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + %0 = fmul fast <2 x double> %b.imag, %a.real + %1 = fmul fast <2 x double> %b.real, %a.imag + %2 = fadd fast <2 x double> %1, %0 + %3 = fmul fast <2 x double> %b.real, %a.real + %4 = fmul fast <2 x double> %a.imag, %b.imag + %5 = fsub fast <2 x double> %3, %4 + %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> + ret <4 x double> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: complex_mul_v8f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: add r0, sp, #128 +; CHECK-NEXT: vmov q7, q1 +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: add r0, sp, #160 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q6, q0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: add r0, sp, #176 +; CHECK-NEXT: vmov q5, q3 +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d5, d3, d0 +; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q5, [sp] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d4, d1, d3 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov q0, q5 +; CHECK-NEXT: add r0, sp, #144 +; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmul.f64 d11, d3, d0 +; CHECK-NEXT: vmul.f64 d10, d1, d3 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vmul.f64 d7, d9, d12 +; CHECK-NEXT: vmul.f64 d2, d15, d1 +; CHECK-NEXT: vmul.f64 d3, d1, d14 +; CHECK-NEXT: vmul.f64 d6, d13, d9 +; CHECK-NEXT: vfma.f64 d7, d8, d13 +; CHECK-NEXT: vfnms.f64 d6, d8, d12 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d3, d0, d15 +; CHECK-NEXT: vfnms.f64 d2, d0, d14 +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d5, d0, d9 +; CHECK-NEXT: vfnms.f64 d4, d0, d8 +; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vfma.f64 d11, d0, d9 +; CHECK-NEXT: vfnms.f64 d10, d0, d8 +; CHECK-NEXT: vmov q0, q3 +; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: add sp, #64 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> + %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + %0 = fmul fast <4 x double> %b.imag, %a.real + %1 = fmul fast <4 x double> %b.real, %a.imag + %2 = fadd fast <4 x double> %1, %0 + %3 = fmul fast <4 x double> %b.real, %a.real + %4 = fmul fast <4 x double> %a.imag, %b.imag + %5 = fsub fast <4 x double> %3, %4 + %interleaved.vec = shufflevector 
<4 x double> %5, <4 x double> %2, <8 x i32> + ret <8 x double> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll @@ -0,0 +1,387 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_mul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q3, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q3, q0, q1, #90 +; CHECK-NEXT: vcmul.f32 q0, q3, q2, #0 +; CHECK-NEXT: vcmla.f32 q0, q3, q2, #90 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151 + %1 = fmul fast <2 x float> %strided.vec153, %strided.vec + %2 = fmul fast <2 x float> %strided.vec154, %strided.vec + %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151 + %4 = fadd fast <2 x float> %3, %2 + %5 = fsub fast <2 x float> %1, %0 + %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %6 = fmul fast <2 x float> %4, %strided.vec156 + %7 = fmul fast <2 x float> %5, %strided.vec157 + %8 = fadd fast <2 x float> %6, %7 + %9 = fmul fast <2 x float> %strided.vec156, %5 + %10 = fmul fast <2 x float> %4, %strided.vec157 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: add_mul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vsub.f32 q3, q1, q2 +; CHECK-NEXT: vsub.f32 q0, q1, q0 +; CHECK-NEXT: vmov.f32 s16, s9 +; CHECK-NEXT: vmov.f32 s13, s14 +; CHECK-NEXT: vmov.f32 s17, s11 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmul.f32 q1, q3, q4 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov.f32 s9, s10 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vmul.f32 q0, q4, q0 +; CHECK-NEXT: vneg.f32 q4, q0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vfma.f32 q4, q2, q3 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %0 = fsub fast <4 x float> %b, %c + %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> + %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %2 = fmul fast <2 x float> %1, %strided.vec59 + %3 = fsub fast <4 x float> %b, %a + %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> + %5 = fmul fast <2 x float> %strided.vec58, %4 + %6 = fadd fast <2 x float> %5, %2 + %7 = fmul fast <2 x float> %strided.vec58, %1 + %8 = fmul fast <2 x float> %strided.vec59, %4 + %9 = fsub 
fast <2 x float> %7, %8 + %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_mul270_mul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d12} +; CHECK-NEXT: vpush {d12} +; CHECK-NEXT: .vsave {d10} +; CHECK-NEXT: vpush {d10} +; CHECK-NEXT: .vsave {d8} +; CHECK-NEXT: vpush {d8} +; CHECK-NEXT: vmov.f32 s20, s4 +; CHECK-NEXT: vmov.f32 s16, s8 +; CHECK-NEXT: vmov.f32 s17, s10 +; CHECK-NEXT: vmov.f32 s21, s6 +; CHECK-NEXT: vmul.f32 q3, q5, q4 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vneg.f32 q3, q3 +; CHECK-NEXT: vmov.f32 s24, s9 +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vmul.f32 q2, q1, q4 +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vfma.f32 q3, q1, q6 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vfma.f32 q2, q5, q6 +; CHECK-NEXT: vmul.f32 q1, q3, q4 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vmul.f32 q0, q3, q0 +; CHECK-NEXT: vneg.f32 q3, q0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vfma.f32 q3, q2, q4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s0, s12 +; CHECK-NEXT: vmov.f32 s2, s13 +; CHECK-NEXT: vpop {d8} +; CHECK-NEXT: vpop {d10} +; CHECK-NEXT: vpop {d12} +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec84, %strided.vec + %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81 + %2 = fadd fast <2 x float> %1, %0 + %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %3 = fmul fast <2 x float> %2, %strided.vec87 + %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81 + %5 = fmul fast <2 x float> %strided.vec83, %strided.vec + %6 = fsub fast <2 x float> %4, %5 + %7 = fmul fast <2 x float> %6, %strided.vec86 + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec86 + %10 = fmul fast <2 x float> %6, %strided.vec87 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; (a * b) * a +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: mul_triangle: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q2, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q2, q1, q0, #90 +; CHECK-NEXT: vcmul.f32 q1, q0, q2, #0 +; CHECK-NEXT: vcmla.f32 q1, q0, q2, #90 +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec37, %strided.vec + %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec35 + %4 = fmul fast <2 x 
float> %strided.vec38, %strided.vec + %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37 + %6 = fadd fast <2 x float> %4, %5 + %7 = fmul fast <2 x float> %6, %strided.vec + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec + %10 = fmul fast <2 x float> %6, %strided.vec35 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + + +; d * (b * a) * (c * a) +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) { +; CHECK-LABEL: mul_diamond: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f32 q4, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q4, q1, q0, #90 +; CHECK-NEXT: vcmul.f32 q1, q4, q3, #0 +; CHECK-NEXT: vcmla.f32 q1, q4, q3, #90 +; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0 +; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90 +; CHECK-NEXT: vcmul.f32 q0, q3, q1, #0 +; CHECK-NEXT: vcmla.f32 q0, q3, q1, #90 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %a.imag, %b.real + %1 = fmul fast <2 x float> %a.real, %b.imag + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %a.real, %b.real + %4 = fmul fast <2 x float> %b.imag, %a.imag + %5 = fsub fast <2 x float> %3, %4 + %6 = fmul fast <2 x float> %d.real, %5 + %7 = fmul fast <2 x float> %2, %d.imag + %8 = fmul fast <2 x float> %d.real, %2 + %9 = fmul fast <2 x float> %5, %d.imag + %10 = fsub fast <2 x float> %6, %7 + %11 = fadd fast <2 x float> %8, %9 + %12 = fmul fast <2 x float> %c.real, %a.imag + %13 = fmul fast <2 x float> %c.imag, %a.real + %14 = fadd fast <2 x float> %13, %12 + %15 = fmul fast <2 x float> %14, %10 + %16 = fmul fast <2 x float> %c.real, %a.real + %17 = fmul fast <2 x float> %c.imag, %a.imag + %18 = fsub fast <2 x float> %16, %17 + %19 = fmul fast <2 x float> %18, %11 + %20 = fadd fast <2 x float> %15, %19 + %21 = fmul fast <2 x float> %18, %10 + %22 = fmul fast <2 x float> %14, %11 + %23 = fsub fast <2 x float> %21, %22 + %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_add90_mul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vcmul.f32 q3, q2, q0, #0 +; CHECK-NEXT: vcmul.f32 q4, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q4, q1, q0, #90 +; CHECK-NEXT: vcmla.f32 q3, q2, q0, #90 +; CHECK-NEXT: vcadd.f32 q0, q3, q4, #90 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %bi = shufflevector 
<4 x float> %b, <4 x float> poison, <2 x i32> + %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + + %i6 = fmul fast <2 x float> %br, %ar + %i7 = fmul fast <2 x float> %bi, %ai + %xr = fsub fast <2 x float> %i6, %i7 + %i9 = fmul fast <2 x float> %bi, %ar + %i10 = fmul fast <2 x float> %br, %ai + %xi = fadd fast <2 x float> %i9, %i10 + + %j6 = fmul fast <2 x float> %cr, %ar + %j7 = fmul fast <2 x float> %ci, %ai + %yr = fsub fast <2 x float> %j6, %j7 + %j9 = fmul fast <2 x float> %ci, %ar + %j10 = fmul fast <2 x float> %cr, %ai + %yi = fadd fast <2 x float> %j9, %j10 + + %zr = fsub fast <2 x float> %yr, %xi + %zi = fadd fast <2 x float> %yi, %xr + %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: mul_triangle_addmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vmov.f32 s16, s0 +; CHECK-NEXT: vmov.f32 s20, s5 +; CHECK-NEXT: vmov.f32 s17, s2 +; CHECK-NEXT: vmov.f32 s21, s7 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmul.f32 q3, q5, q4 +; CHECK-NEXT: vmul.f32 q4, q1, q4 +; CHECK-NEXT: vmov.f32 s0, s1 +; CHECK-NEXT: vmov.f32 s1, s3 +; CHECK-NEXT: vmov q6, q4 +; CHECK-NEXT: vfms.f32 q6, q5, q0 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vmov.f32 s20, s8 +; CHECK-NEXT: vmov.f32 s21, s10 +; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vfma.f32 q7, q5, q0 +; CHECK-NEXT: vmov.f32 s5, s11 +; CHECK-NEXT: vadd.f32 q5, q7, q6 +; CHECK-NEXT: vfms.f32 q4, q1, q0 +; CHECK-NEXT: vmov.f32 s1, s20 +; CHECK-NEXT: vsub.f32 q1, q4, q3 +; CHECK-NEXT: vmov.f32 s3, s21 +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + + %i6 = fmul fast <2 x float> %br, %ar + %i7 = fmul fast <2 x float> %bi, %ai + %xr = fsub fast <2 x float> %i6, %i7 + %i9 = fmul fast <2 x float> %bi, %ar + %i10 = fmul fast <2 x float> %br, %ai + %xi = fadd fast <2 x float> %i9, %i10 + + ;%j6 = fmul fast <2 x float> %cr, %ar + %j7 = fmul fast <2 x float> %ci, %ai + %yr = fsub fast <2 x float> %i6, %j7 + ;%j9 = fmul fast <2 x float> %ci, %ar + %j10 = fmul fast <2 x float> %cr, %ai + %yi = fadd fast <2 x float> %i9, %j10 + + %zr = fsub fast <2 x float> %yr, %xi + %zi = fadd fast <2 x float> %yi, %xr + %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) { +; CHECK-LABEL: mul_triangle_multiuses: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vmov.f32 s17, s6 +; CHECK-NEXT: vmov.f32 s9, s3 +; 
CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmul.f32 q3, q2, q4 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vmul.f32 q1, q1, q2 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q4, q0 +; CHECK-NEXT: vmov.f32 s18, s12 +; CHECK-NEXT: vmov.f32 s16, s4 +; CHECK-NEXT: vmov.f32 s17, s5 +; CHECK-NEXT: vmov.f32 s19, s13 +; CHECK-NEXT: vstrw.32 q4, [r0] +; CHECK-NEXT: vmul.f32 q4, q3, q0 +; CHECK-NEXT: vfma.f32 q4, q1, q2 +; CHECK-NEXT: vmul.f32 q2, q3, q2 +; CHECK-NEXT: vneg.f32 q2, q2 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vmov.f32 s1, s16 +; CHECK-NEXT: vmov.f32 s0, s8 +; CHECK-NEXT: vmov.f32 s2, s9 +; CHECK-NEXT: vmov.f32 s3, s17 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec37, %strided.vec + %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec35 + %4 = fmul fast <2 x float> %strided.vec38, %strided.vec + %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37 + %6 = fadd fast <2 x float> %4, %5 + %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> + store <4 x float> %otheruse, ptr %p + %7 = fmul fast <2 x float> %6, %strided.vec + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec + %10 = fmul fast <2 x float> %6, %strided.vec35 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s + +target triple = "thumbv8.1m.main-none-none-eabi" + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @simple_mul(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_mul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 +; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec20, %strided.vec + %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17 + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %strided.vec19, %strided.vec + %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20 + %5 = fsub fast <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x float> @simple_mul_no_contract(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: 
simple_mul_no_contract: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vmul.f32 q4, q3, q2 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmul.f32 q2, q2, q0 +; CHECK-NEXT: vmul.f32 q5, q1, q0 +; CHECK-NEXT: vfma.f32 q2, q1, q3 +; CHECK-NEXT: vsub.f32 q4, q5, q4 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s0, s16 +; CHECK-NEXT: vmov.f32 s2, s17 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec20, %strided.vec + %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17 + %2 = fadd fast <2 x float> %1, %0 + %3 = fmul fast <2 x float> %strided.vec19, %strided.vec + %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20 + %5 = fsub <2 x float> %3, %4 + %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @three_way_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: three_way_mul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q3, q1, q0, #0 +; CHECK-NEXT: vcmla.f32 q3, q1, q0, #90 +; CHECK-NEXT: vcmul.f32 q0, q2, q3, #0 +; CHECK-NEXT: vcmla.f32 q0, q2, q3, #90 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec39 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec41 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec42 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec44 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %strided.vec45 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> + %0 = fmul fast <2 x float> %strided.vec41, %strided.vec + %1 = fmul fast <2 x float> %strided.vec42, %strided.vec39 + %2 = fsub fast <2 x float> %0, %1 + %3 = fmul fast <2 x float> %2, %strided.vec45 + %4 = fmul fast <2 x float> %strided.vec42, %strided.vec + %5 = fmul fast <2 x float> %strided.vec39, %strided.vec41 + %6 = fadd fast <2 x float> %4, %5 + %7 = fmul fast <2 x float> %6, %strided.vec44 + %8 = fadd fast <2 x float> %3, %7 + %9 = fmul fast <2 x float> %2, %strided.vec44 + %10 = fmul fast <2 x float> %6, %strided.vec45 + %11 = fsub fast <2 x float> %9, %10 + %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @simple_add_90(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_add_90: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f32 q2, q1, q0, #90 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = 
fsub fast <2 x float> %strided.vec19, %strided.vec17 + %1 = fadd fast <2 x float> %strided.vec20, %strided.vec + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform, fadd commutativity is not yet implemented +define arm_aapcs_vfpcc <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_add_270_false: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vsub.f32 q2, q3, q2 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vadd.f32 q1, q1, q0 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fadd fast <2 x float> %strided.vec20, %strided.vec + %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19 + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to transform +define arm_aapcs_vfpcc <4 x float> @simple_add_270_true(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: simple_add_270_true: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcadd.f32 q2, q0, q1, #270 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fadd fast <2 x float> %strided.vec, %strided.vec20 + %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19 + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <4 x float> @add_external_use(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: add_external_use: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s4 +; CHECK-NEXT: vmov.f32 s12, s1 +; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vmov.f32 s9, s6 +; CHECK-NEXT: vmov.f32 s13, s3 +; CHECK-NEXT: vmov.f32 s5, s7 +; CHECK-NEXT: vadd.f32 q2, q3, q2 +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vsub.f32 q1, q0, q1 +; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s0, s4 +; CHECK-NEXT: vmov.f32 s2, s5 +; CHECK-NEXT: vmov.f32 s3, s9 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fsub fast <2 x float> %a.real, %b.imag + %1 = fadd fast <2 x float> %a.imag, %b.real + %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> + %dup = shufflevector <2 x float> %0, <2 x float> poison, <4 x i32> + %interleaved.vec2 = shufflevector <4 x float> %interleaved.vec, <4 x float> %dup, <4 x i32> + ret <4 x float> %interleaved.vec2 
+} + +define arm_aapcs_vfpcc <4 x float> @mul_mul_with_fneg(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: mul_mul_with_fneg: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmul.f32 q2, q1, q0, #270 +; CHECK-NEXT: vcmla.f32 q2, q1, q0, #180 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> + %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> + %0 = fneg fast <2 x float> %a.imag + %1 = fmul fast <2 x float> %b.real, %0 + %2 = fmul fast <2 x float> %a.real, %b.imag + %3 = fsub fast <2 x float> %1, %2 + %4 = fmul fast <2 x float> %b.imag, %a.imag + %5 = fmul fast <2 x float> %a.real, %b.real + %6 = fsub fast <2 x float> %4, %5 + %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %3, <4 x i32> + ret <4 x float> %interleaved.vec +} + +; Expected to not transform +define arm_aapcs_vfpcc <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { +; CHECK-LABEL: abp90c12: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: vldr s23, [sp, #124] +; CHECK-NEXT: vmov.f32 s20, s13 +; CHECK-NEXT: vldr s22, [sp, #116] +; CHECK-NEXT: vmov.f32 s25, s11 +; CHECK-NEXT: vmov.f32 s13, s10 +; CHECK-NEXT: vldr s19, [sp, #120] +; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vldr s18, [sp, #112] +; CHECK-NEXT: vmov.f32 s6, s5 +; CHECK-NEXT: vldr s31, [sp, #172] +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vldr s30, [sp, #164] +; CHECK-NEXT: vmov.f32 s21, s15 +; CHECK-NEXT: vldr s29, [sp, #156] +; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vldr s28, [sp, #148] +; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: vmov.f32 s24, s9 +; CHECK-NEXT: vmov.f32 s16, s12 +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.f32 s12, s8 +; CHECK-NEXT: vldr s27, [sp, #168] +; CHECK-NEXT: vmov.f32 s17, s14 +; CHECK-NEXT: vldr s26, [sp, #160] +; CHECK-NEXT: vmov.f32 s9, s2 +; CHECK-NEXT: vldr s25, [sp, #152] +; CHECK-NEXT: vmov.f32 s8, s0 +; CHECK-NEXT: vmul.f32 q0, q5, q1 +; CHECK-NEXT: vmul.f32 q1, q4, q1 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: vldr s24, [sp, #144] +; CHECK-NEXT: vfma.f32 q1, q5, q2 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vsub.f32 q6, q6, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload +; CHECK-NEXT: vldr s13, [sp, #140] +; CHECK-NEXT: vfma.f32 q1, q4, q2 +; CHECK-NEXT: vldr s12, [sp, #132] +; CHECK-NEXT: vadd.f32 q1, q7, q1 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldr s1, [sp, #136] +; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmul.f32 q2, q3, q7 +; CHECK-NEXT: vldr s0, [sp, #128] +; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vneg.f32 q2, q2 +; CHECK-NEXT: vldr s21, [sp, #184] +; CHECK-NEXT: vfma.f32 q2, q0, q3 +; CHECK-NEXT: vmul.f32 q0, q0, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldr s20, [sp, #176] +; CHECK-NEXT: vldr s17, [sp, #188] +; CHECK-NEXT: vldr s16, [sp, #180] +; CHECK-NEXT: vfma.f32 q0, q7, q3 +; CHECK-NEXT: vsub.f32 q3, q5, q0 +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vadd.f32 q4, q4, q2 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vmov.f32 s5, s6 +; 
CHECK-NEXT: vmov.f32 s0, s24 +; CHECK-NEXT: vmov.f32 s2, s25 +; CHECK-NEXT: vmov.f32 s4, s26 +; CHECK-NEXT: vmov.f32 s6, s27 +; CHECK-NEXT: vmov.f32 s8, s12 +; CHECK-NEXT: vmov.f32 s9, s16 +; CHECK-NEXT: vmov.f32 s10, s13 +; CHECK-NEXT: vmov.f32 s11, s17 +; CHECK-NEXT: add sp, #48 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> + %ai = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> + %br = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> + %bi = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> + %cr = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> + %ci = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> + + %i6 = fmul fast <6 x float> %br, %ar + %i7 = fmul fast <6 x float> %bi, %ai + %xr = fsub fast <6 x float> %i6, %i7 + %i9 = fmul fast <6 x float> %bi, %ar + %i10 = fmul fast <6 x float> %br, %ai + %xi = fadd fast <6 x float> %i9, %i10 + + %zr = fsub fast <6 x float> %cr, %xi + %zi = fadd fast <6 x float> %ci, %xr + %interleaved.vec = shufflevector <6 x float> %zr, <6 x float> %zi, <12 x i32> + ret <12 x float> %interleaved.vec +}
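For reference, the multiply tests above that are marked "Expected to transform" all exercise the textbook complex product computed on deinterleaved lanes: with a = ar + i*ai and b = br + i*bi, the real lanes hold ar*br - ai*bi and the imaginary lanes hold ar*bi + ai*br, and (per the CHECK lines in complex_mul_v4f32) the MVE backend lowers the pair to a vcmul with rotation #0 followed by a vcmla with rotation #90. Below is a minimal standalone sketch of that pattern, mirroring complex_mul_v4f32; it is not part of the patch, the function name is illustrative, and the even/odd deinterleave masks plus the final <0, 2, 1, 3> interleave mask are written out in full.

; Minimal illustrative sketch (not part of the patch), mirroring complex_mul_v4f32.
define <4 x float> @complex_mul_v4f32_sketch(<4 x float> %a, <4 x float> %b) {
entry:
  ; Deinterleave: even lanes carry the real parts, odd lanes the imaginary parts.
  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  ; Imaginary component: ar*bi + ai*br.
  %im0 = fmul fast <2 x float> %b.imag, %a.real
  %im1 = fmul fast <2 x float> %b.real, %a.imag
  %im  = fadd fast <2 x float> %im1, %im0
  ; Real component: ar*br - ai*bi.
  %re0 = fmul fast <2 x float> %b.real, %a.real
  %re1 = fmul fast <2 x float> %a.imag, %b.imag
  %re  = fsub fast <2 x float> %re0, %re1
  ; Re-interleave into <re0, im0, re1, im1> lane order; this "converging" shuffle
  ; is the root from which the pattern is matched.
  %interleaved.vec = shufflevector <2 x float> %re, <2 x float> %im, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x float> %interleaved.vec
}

The add tests follow the same shape with a single fadd/fsub pair per rotation instead of the four multiplies, which is why the v2 cases (one complex element per vector) and the f64 cases without a packed MVE complex instruction are expected to stay untransformed.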