diff --git a/llvm/include/llvm/CodeGen/ExpandComplex.h b/llvm/include/llvm/CodeGen/ExpandComplex.h new file mode 100644 --- /dev/null +++ b/llvm/include/llvm/CodeGen/ExpandComplex.h @@ -0,0 +1,22 @@ +//===---- ExpandComplex.h - Expand experimental complex intrinsics --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_EXPANDCOMPLEX_H +#define LLVM_CODEGEN_EXPANDCOMPLEX_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class ExpandComplexPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_CODEGEN_EXPANDCOMPLEX_H diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1244,6 +1244,9 @@ VECREDUCE_UMAX, VECREDUCE_UMIN, + /// COMPLEX_MUL - Do a naive complex multiplication. + COMPLEX_MUL, + // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(VPSDID, ...) VPSDID, #include "llvm/IR/VPIntrinsics.def" diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -472,6 +472,10 @@ /// printing assembly. ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true); + /// This pass expands the experimental complex intrinsics into regular + /// floating-point arithmetic or calls to __mulsc3 (or similar) functions. + FunctionPass *createExpandComplexPass(); + /// This pass expands the experimental reduction intrinsics into sequences of /// shuffles. FunctionPass *createExpandReductionsPass(); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -652,6 +652,24 @@ return false; } + /// Enum that specifies how a C complex type is lowered (in LLVM type terms). + enum class ComplexABI { + Memory, ///< Indicates that a pointer to the struct is passed. + Vector, ///< Indicates that T _Complex can be passed as <2 x T>. + Struct, ///< Indicates that T _Complex can be passed as {T, T}. + Integer, ///< Indicates that an integer of the same size is passed. + }; + + /// Returns how a C complex type is lowered when used as the return value. + virtual ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const { + return ComplexABI::Struct; + } + + /// Returns true if the target can match the @llvm.experimental.complex.fmul + /// intrinsic with the given type. Such an intrinsic is assumed will only be + /// matched when "complex-range" is "limited" or "no-nan". 
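+  /// The type passed in is the IR vector type used by the intrinsic (e.g.
+  /// <2 x float> for one float _Complex value); targets that return true are
+  /// expected to custom-lower the resulting ISD::COMPLEX_MUL node for that
+  /// type.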
+ virtual bool CustomLowerComplexMultiply(Type *FloatTy) const { return false; } + /// Return if the target supports combining a /// chain like: /// \code @@ -2515,6 +2533,7 @@ case ISD::FMAXNUM_IEEE: case ISD::FMINIMUM: case ISD::FMAXIMUM: + case ISD::COMPLEX_MUL: return true; default: return false; } diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -154,6 +154,7 @@ void initializeEHContGuardCatchretPass(PassRegistry &); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeEntryExitInstrumenterPass(PassRegistry&); +void initializeExpandComplexPass(PassRegistry &); void initializeExpandMemCmpPassPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -703,6 +703,7 @@ def assertzext : SDNode<"ISD::AssertZext", SDT_assert>; def assertalign : SDNode<"ISD::AssertAlign", SDT_assert>; +def COMPLEX_MUL : SDNode<"ISD::COMPLEX_MUL", SDTFPBinOp, [SDNPCommutative]>; //===----------------------------------------------------------------------===// // Selection DAG Condition Codes diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -53,6 +53,7 @@ EdgeBundles.cpp EHContGuardCatchret.cpp ExecutionDomainFix.cpp + ExpandComplex.cpp ExpandMemCmp.cpp ExpandPostRAPseudos.cpp ExpandReductions.cpp diff --git a/llvm/lib/CodeGen/ExpandComplex.cpp b/llvm/lib/CodeGen/ExpandComplex.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/CodeGen/ExpandComplex.cpp @@ -0,0 +1,293 @@ +//===-- ExpandComplex.cpp - Expand experimental complex intrinsics --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR expansion for complex intrinsics, allowing targets +// to enable the intrinsics until just before codegen. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/ExpandComplex.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +using namespace llvm::PatternMatch; + +namespace { + +bool expandComplexInstruction(IntrinsicInst *CI, const TargetLowering *TLI, + const DataLayout &DL) { + Intrinsic::ID Opcode = CI->getIntrinsicID(); + assert((Opcode == Intrinsic::experimental_complex_fmul || + Opcode == Intrinsic::experimental_complex_fdiv) && + "Expected a complex instruction"); + + // Break the input values up into real and imaginary pieces. 
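+  // For reference, the lowerings below implement the usual identities
+  //   (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
+  //   (a + bi) / (c + di) = ((ac + bd) + (bc - ad)i) / (c*c + d*d)
+  // with a = LhsR, b = LhsI, c = RhsR, d = RhsI.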
+ Type *ComplexVectorTy = CI->getArgOperand(0)->getType(); + Type *FloatTy = ComplexVectorTy->getScalarType(); + IRBuilder<> Builder(CI); + Builder.setFastMathFlags(CI->getFastMathFlags()); + Value *LhsR = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(0)); + Value *LhsI = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(1)); + Value *RhsR = nullptr, *RhsI = nullptr; + RhsR = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(0)); + RhsI = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(1)); + + // The expansion has three pieces: the naive arithmetic, a possible prescaling + // (not relevant for multiplication), and a step to convert NaN output values + // to infinity values in certain situations (see Annex G of the C + // specification for more details). The "complex-range" attribute determines + // how many we need: "limited" has just the first one, "no-nan" the first two, + // and "full" for all three. + + // Get the "complex-range" attribute, setting a default based on the presence + // of fast-math flags. + StringRef Range = CI->getFnAttr("complex-range").getValueAsString(); + if (Range.empty()) { + Range = CI->getFastMathFlags().noNaNs() || CI->getFastMathFlags().noInfs() + ? "no-nan" + : "full"; + } + + // We can expand to naive arithmetic code if we only need the first piece. For + // multiplication, we can also accept "no-nan", since there is no semantic + // difference between "limited" and "no-nan" in that case. + bool CanExpand = + Range == "limited" || + (Range == "no-nan" && Opcode == Intrinsic::experimental_complex_fmul); + + Value *OutReal, *OutImag; + if (!CanExpand) { + // Do a call directly to the compiler-rt library here. + const char *Name = nullptr; + if (Opcode == Intrinsic::experimental_complex_fmul) { + if (FloatTy->isHalfTy()) + Name = "__mulhc3"; + else if (FloatTy->isFloatTy()) + Name = "__mulsc3"; + else if (FloatTy->isDoubleTy()) + Name = "__muldc3"; + else if (FloatTy->isX86_FP80Ty()) + Name = "__mulxc3"; + else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty()) + Name = "__multc3"; + } else if (Opcode == Intrinsic::experimental_complex_fdiv) { + if (FloatTy->isHalfTy()) + Name = "__divhc3"; + else if (FloatTy->isFloatTy()) + Name = "__divsc3"; + else if (FloatTy->isDoubleTy()) + Name = "__divdc3"; + else if (FloatTy->isX86_FP80Ty()) + Name = "__divxc3"; + else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty()) + Name = "__divtc3"; + } + + if (!Name) + report_fatal_error("Cannot find libcall for intrinsic"); + + // The function we are to call is T complex __name(T, T, T, T) in C terms. + // Use TLI to figure out what the appropriate actual ABI for this function. + StructType *ComplexStructTy = StructType::get(FloatTy, FloatTy); + switch (TLI->getComplexReturnABI(FloatTy)) { + case TargetLowering::ComplexABI::Vector: { + // When the result is a vector type directly, we can replace the intrinsic + // with the call to the underlying function without any other munging. + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, ComplexVectorTy, FloatTy, FloatTy, FloatTy, FloatTy); + Value *NewResult = Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI}); + CI->replaceAllUsesWith(NewResult); + CI->eraseFromParent(); + return true; + } + case TargetLowering::ComplexABI::Integer: { + // This ABI form packs the type as a small struct in an integer register. + // All we need to do is move the integer to a vector register, without any + // other munging. 
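+      // For example, a float _Complex result on Windows comes back as an
+      // i64, which the bitcast below converts back to <2 x float>.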
+ uint64_t Width = ComplexVectorTy->getPrimitiveSizeInBits().getFixedSize(); + Type *IntegerTy = Builder.getIntNTy(Width); + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, IntegerTy, FloatTy, FloatTy, FloatTy, FloatTy); + Value *NewResult = Builder.CreateBitCast( + Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI}), ComplexVectorTy); + CI->replaceAllUsesWith(NewResult); + CI->eraseFromParent(); + return true; + } + case TargetLowering::ComplexABI::Memory: { + // Allocate a struct for the return type in the entry block. Stack slot + // coloring should remove duplicate allocations. + unsigned AllocaAS = DL.getAllocaAddrSpace(); + Value *Alloca; + { + IRBuilderBase::InsertPointGuard Guard(Builder); + BasicBlock *EntryBB = &CI->getParent()->getParent()->getEntryBlock(); + Builder.SetInsertPoint(EntryBB, EntryBB->begin()); + Alloca = Builder.CreateAlloca(ComplexStructTy, AllocaAS); + } + + AttributeList Attrs; + Attrs = Attrs.addParamAttribute( + CI->getContext(), 0, + Attribute::getWithStructRetType(CI->getContext(), ComplexStructTy)); + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, std::move(Attrs), Type::getVoidTy(CI->getContext()), + PointerType::get(ComplexStructTy, AllocaAS), FloatTy, FloatTy, + FloatTy, FloatTy); + + Builder.CreateCall(Func, {Alloca, LhsR, LhsI, RhsR, RhsI}); + OutReal = Builder.CreateLoad( + FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 0)); + OutImag = Builder.CreateLoad( + FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 1)); + break; + } + case TargetLowering::ComplexABI::Struct: { + FunctionCallee Func = CI->getModule()->getOrInsertFunction( + Name, ComplexStructTy, FloatTy, FloatTy, FloatTy, FloatTy); + Value *ComplexStructRes = + Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI}); + OutReal = Builder.CreateExtractValue(ComplexStructRes, 0); + OutImag = Builder.CreateExtractValue(ComplexStructRes, 1); + break; + } + } + } else { + switch (Opcode) { + case Intrinsic::experimental_complex_fmul: { + // If the target has a complex_fmul expansion and the fast-math flag + // set, use that instead of expanding. + if (TLI->CustomLowerComplexMultiply(ComplexVectorTy)) { + return false; + } + + OutReal = Builder.CreateFSub(Builder.CreateFMul(LhsR, RhsR), + Builder.CreateFMul(LhsI, RhsI)); + OutImag = Builder.CreateFAdd(Builder.CreateFMul(LhsI, RhsR), + Builder.CreateFMul(LhsR, RhsI)); + break; + } + case Intrinsic::experimental_complex_fdiv: { + Value *Scale = Builder.CreateFAdd(Builder.CreateFMul(RhsR, RhsR), + Builder.CreateFMul(RhsI, RhsI)); + OutReal = + Builder.CreateFDiv(Builder.CreateFAdd(Builder.CreateFMul(LhsR, RhsR), + Builder.CreateFMul(LhsI, RhsI)), + Scale); + OutImag = + Builder.CreateFDiv(Builder.CreateFSub(Builder.CreateFMul(LhsI, RhsR), + Builder.CreateFMul(LhsR, RhsI)), + Scale); + break; + } + } + } + + // Replace all of the uses of the intrinsic with OutReal/OutImag. We avoid + // creating the vector unless we have to. + bool HasVectorUse = false; + for (User *U : CI->users()) { + uint64_t Index; + if (match(U, m_ExtractElt(m_Value(), m_ConstantInt(Index)))) { + assert((Index == 0 || Index == 1) && "Extract element too small"); + U->replaceAllUsesWith(Index == 0 ? 
OutReal : OutImag); + } else { + HasVectorUse = true; + } + } + + if (HasVectorUse) { + Value *OutComplex = Builder.CreateInsertElement( + Builder.CreateInsertElement(UndefValue::get(ComplexVectorTy), OutReal, + uint64_t(0)), + OutImag, uint64_t(1)); + CI->replaceAllUsesWith(OutComplex); + } else { + CI->replaceAllUsesWith(UndefValue::get(CI->getType())); + } + + CI->eraseFromParent(); + return true; +} + +bool expandComplexIntrinsics(Function &F, const TargetLowering *TLI) { + bool Changed = false; + SmallVector Worklist; + for (auto &I : instructions(F)) { + if (auto *II = dyn_cast(&I)) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::experimental_complex_fmul: + case Intrinsic::experimental_complex_fdiv: + Worklist.push_back(II); + break; + } + } + } + + const DataLayout &DL = F.getParent()->getDataLayout(); + for (auto *II : Worklist) { + Changed |= expandComplexInstruction(II, TLI, DL); + } + return Changed; +} + +class ExpandComplex : public FunctionPass { +public: + static char ID; + ExpandComplex() : FunctionPass(ID) { + initializeExpandComplexPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + const TargetMachine *TM = + &getAnalysis().getTM(); + const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F); + const TargetLowering *TLI = SubtargetInfo->getTargetLowering(); + return expandComplexIntrinsics(F, TLI); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesCFG(); + } +}; +} // namespace + +char ExpandComplex::ID; +INITIALIZE_PASS_BEGIN(ExpandComplex, "expand-complex", + "Expand complex intrinsics", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(ExpandComplex, "expand-complex", + "Expand complex intrinsics", false, false) + +FunctionPass *llvm::createExpandComplexPass() { return new ExpandComplex(); } + +PreservedAnalyses ExpandComplexPass::run(Function &F, + FunctionAnalysisManager &AM) { + /*const auto &TTI = AM.getResult(F); + if (!expandReductions(F, &TTI)) + return PreservedAnalyses::all();*/ + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1058,6 +1058,7 @@ case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: + case ISD::COMPLEX_MUL: SplitVecRes_BinOp(N, Lo, Hi); break; case ISD::FMA: @@ -3287,6 +3288,7 @@ case ISD::USHLSAT: case ISD::ROTL: case ISD::ROTR: + case ISD::COMPLEX_MUL: // Vector-predicated binary op widening. 
Note that -- unlike the // unpredicated versions -- we don't have to worry about trapping on // operations like UDIV, FADD, etc., as we pass on the original vector diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7186,6 +7186,12 @@ case Intrinsic::experimental_vector_splice: visitVectorSplice(I); return; + case Intrinsic::experimental_complex_fmul: + EVT ResultVT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + setValue(&I, DAG.getNode(ISD::COMPLEX_MUL, sdl, ResultVT, + getValue(I.getOperand(0)), + getValue(I.getOperand(1)), Flags)); + return; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,6 +480,7 @@ case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; + case ISD::COMPLEX_MUL: return "complex_mul"; // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -922,6 +922,10 @@ // Allow disabling it for testing purposes. if (!DisableExpandReductions) addPass(createExpandReductionsPass()); + + // If the target doesn't support complex intrinsics, or if they need to be + // expanded into more complex calls, generate the expansion to complex calls. + addPass(createExpandComplexPass()); } /// Turn exception handling constructs into something the code generators can diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -979,6 +979,8 @@ /// legal as the hook is used before type legalization. bool isSafeMemOpType(MVT VT) const override; + ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const override; + /// Returns true if the target allows unaligned memory accesses of the /// specified type. Returns whether it is "fast" in the last argument. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, @@ -1466,6 +1468,8 @@ unsigned getMaxSupportedInterleaveFactor() const override { return 4; } + bool CustomLowerComplexMultiply(Type *FloatTy) const override; + /// Lower interleaved load(s) into target specific /// instructions/intrinsics. 
bool lowerInterleavedLoad(LoadInst *LI, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2118,6 +2118,22 @@ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); } + if (Subtarget.hasFP16()) { + for (auto VT : {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16}) { + if (Subtarget.hasVLX()) + setOperationAction(ISD::COMPLEX_MUL, VT, Custom); + setOperationAction(ISD::COMPLEX_MUL, MVT::v32f16, Custom); + } + } + if (Subtarget.hasAnyFMA() || (Subtarget.hasAVX512() && Subtarget.hasVLX())) { + for (auto VT : {MVT::v2f32, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) + setOperationAction(ISD::COMPLEX_MUL, VT, Custom); + } + if (Subtarget.hasAVX512()) { + setOperationAction(ISD::COMPLEX_MUL, MVT::v8f64, Custom); + setOperationAction(ISD::COMPLEX_MUL, MVT::v16f32, Custom); + } + if (Subtarget.hasAMXTILE()) { addRegisterClass(MVT::x86amx, &X86::TILERegClass); } @@ -2566,6 +2582,44 @@ return MVT::i32; } +TargetLoweringBase::ComplexABI +X86TargetLowering::getComplexReturnABI(Type *ScalarFloatTy) const { + // Windows ABIs don't have dedicated _Complex rules, so they work as regular + // structs. These return as integers if the size is 8 bytes or fewer, or + // structs via memory if larger. (The size threshold is the same for both + // 32 and 64-bit ABIs). + if (Subtarget.isOSWindows()) { + unsigned FloatSize = ScalarFloatTy->getPrimitiveSizeInBits().getFixedSize(); + if (FloatSize <= 32) { + return ComplexABI::Integer; + } else { + return ComplexABI::Memory; + } + } + if (Subtarget.is32Bit()) { + if (ScalarFloatTy->isFloatTy()) { + return ComplexABI::Integer; + } else if (ScalarFloatTy->isHalfTy()) { + return ComplexABI::Vector; + } else { + return ComplexABI::Memory; + } + } else { + // The x86-64 ABI specifies that (save for x86-fp80), this is handled as a + // regular C struct. This means that float and smaller get packed into a + // single vector in xmm0; double and x86-fp80 (by special case) return two + // values; and larger types than x86-fp80 (i.e., fp128) returns via memory. + unsigned FloatSize = ScalarFloatTy->getPrimitiveSizeInBits().getFixedSize(); + if (FloatSize <= 32) { + return ComplexABI::Vector; + } else if (FloatSize <= 80) { + return ComplexABI::Struct; + } else { + return ComplexABI::Memory; + } + } +} + bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; @@ -31578,6 +31632,68 @@ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); } +bool X86TargetLowering::CustomLowerComplexMultiply(Type *FloatTy) const { + auto VecTy = cast(FloatTy); + unsigned VecSize = VecTy->getNumElements() * VecTy->getScalarSizeInBits(); + Type *ElementTy = VecTy->getElementType(); + if (ElementTy->isHalfTy()) { + // All the half type need avx512fp16 enabled. + if (VecSize == 512) + // For 512-bt vector type, just avx512fp16 needed. + return Subtarget.hasFP16(); + else + // 128-bit, 256-bit vector type are legal and other vector type can + // be widened or split. AVX512VL should be enabled. + return Subtarget.hasFP16() && Subtarget.hasVLX(); + } + if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) { + if (VecSize == 512) + // For 512-bt vector type, they are legal or can be split. + return Subtarget.hasAVX512() || Subtarget.hasAnyFMA(); + // 128-bit, 256-bit vector type are legal or and other type can + // be widened or split. 
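+    // (The f32/f64 lowering is a shuffle + FMADDSUB sequence, which is why
+    // FMA or AVX512 + AVX512VL support is required here.)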
+ return Subtarget.hasAnyFMA() || + (Subtarget.hasAVX512() && Subtarget.hasVLX()); + } + return false; +} + +static SDValue LowerComplexMUL(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = Op.getSimpleValueType(); + MVT ElementTy = VT.getScalarType(); + SDLoc DL(Op); + // Custom handling for half type since we have corresponding complex half + // multiply instructions. + // FIXME: We use vfmulcph for sclar complex multiply here, use vfmulcsh + // instead. + if (ElementTy == MVT::f16) { + // Transform llvm.experimental.complex.fmul.vxf16 to vfmulcph instruction. + MVT BitCastTy = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2); + SDValue LHS = DAG.getNode(ISD::BITCAST, DL, BitCastTy, Op.getOperand(0)); + SDValue RHS = DAG.getNode(ISD::BITCAST, DL, BitCastTy, Op.getOperand(1)); + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VFMULC, DL, BitCastTy, LHS, RHS)); + } + assert((ElementTy == MVT::f32 || ElementTy == MVT::f64) && + "Unexpected element type"); + // llvm.experimental.complex.fmul.vxf{32,64} are transformed to SHUFFLE and + // FMA instructions. + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Imm = ElementTy == MVT::SimpleValueType::f32 ? 0xb1 : 0x55; + SDValue V1, V2, V3, V4; + // Swap vcetor elements in pairs. E.g: [1,2,3,4] ---> [2,1,4,3] + V1 = DAG.getNode(X86ISD::VPERMILPI, DL, VT, LHS, + DAG.getTargetConstant(Imm, DL, MVT::i8)); + // Duplicate the odd index elements, which is real part. + V2 = DAG.getNode(X86ISD::MOVSHDUP, DL, VT, RHS); + V3 = DAG.getNode(ISD::FMUL, DL, VT, V1, V2); + // Duplicate the evem index elements, which is imaginary part. + V4 = DAG.getNode(X86ISD::MOVSLDUP, DL, VT, RHS); + return DAG.getNode(X86ISD::FMADDSUB, DL, VT, LHS, V4, V3); +} + /// Provide custom lowering hooks for some operations. SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -31721,6 +31837,7 @@ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); + case ISD::COMPLEX_MUL: return LowerComplexMUL(Op, DAG, Subtarget); } } @@ -32723,6 +32840,22 @@ // to move the scalar in two i32 pieces. Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG)); return; + case ISD::COMPLEX_MUL: + // Widen the vector size smaller than 128 to 128 + MVT VT = N->getSimpleValueType(0); + // FIXME: (COMPLEX_MUL v2f16, v2f16) should be lowered to VFMULCSH but we + // mix the v2f16 and v4f16 here. + assert(VT == MVT::v2f32 || VT == MVT::v2f16 || + VT == MVT::v4f16 && "Unexpected Value type of COMPLEX_MUL!"); + MVT WideVT = + VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; + SmallVector Ops(VT == MVT::v2f16 ? 
4 : 2, DAG.getUNDEF(VT)); + Ops[0] = N->getOperand(0); + SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); + Ops[0] = N->getOperand(1); + SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops); + Results.push_back(DAG.getNode(N->getOpcode(), dl, WideVT, LHS, RHS)); + return; } } diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -29,6 +29,7 @@ ; CHECK-NEXT: Expand vector predication intrinsics ; CHECK-NEXT: Scalarize Masked Memory Intrinsics ; CHECK-NEXT: Expand reduction intrinsics +; CHECK-NEXT: Expand complex intrinsics ; CHECK-NEXT: Expand indirectbr instructions ; CHECK-NEXT: Exception handling preparation ; CHECK-NEXT: Safe Stack instrumentation pass diff --git a/llvm/test/CodeGen/X86/complex-32bit.ll b/llvm/test/CodeGen/X86/complex-32bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-32bit.ll @@ -0,0 +1,216 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 32-bit. + +declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>) +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) +declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>) +declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>) + +define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %esi, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %edx +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: calll __mulhc3@PLT +; CHECK-NEXT: addl $24, %esp +; CHECK-NEXT: .cfi_adjust_cfa_offset -24 +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $28, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: calll __mulsc3@PLT +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: addl $28, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x 
float> %mul +} + + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: calll __muldc3@PLT +; CHECK-NEXT: subl $4, %esp +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: addl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) { +; CHECK-LABEL: intrinsic_f80: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $92, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%esp) +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: calll __mulxc3@PLT +; CHECK-NEXT: subl $4, %esp +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fldt {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: addl $92, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) + ret <2 x x86_fp80> %mul +} + +define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) { +; CHECK-LABEL: intrinsic_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: subl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: .cfi_offset %esi, -20 +; CHECK-NEXT: .cfi_offset %edi, -16 +; CHECK-NEXT: .cfi_offset %ebx, -12 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: subl $12, %esp +; CHECK-NEXT: .cfi_adjust_cfa_offset 12 +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl 
{{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: .cfi_adjust_cfa_offset 4 +; CHECK-NEXT: calll __multc3@PLT +; CHECK-NEXT: .cfi_adjust_cfa_offset -4 +; CHECK-NEXT: addl $76, %esp +; CHECK-NEXT: .cfi_adjust_cfa_offset -76 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %ecx, 28(%esi) +; CHECK-NEXT: movl %eax, 24(%esi) +; CHECK-NEXT: movl %ebp, 20(%esi) +; CHECK-NEXT: movl %ebx, 16(%esi) +; CHECK-NEXT: movl %edi, 12(%esi) +; CHECK-NEXT: movl %edx, 8(%esi) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, 4(%esi) +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl %eax, (%esi) +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: addl $60, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: popl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl $4 + %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w) + ret <2 x fp128> %mul +} + diff --git a/llvm/test/CodeGen/X86/complex-64bit.ll b/llvm/test/CodeGen/X86/complex-64bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-64bit.ll @@ -0,0 +1,98 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 32-bit. 
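+; (The RUN line above uses an x86_64 triple; the 32-bit ABI is covered in
+; complex-32bit.ll.)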
+ +declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>) +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) +declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>) +declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>) + +define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; CHECK-NEXT: callq __mulsc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] +; CHECK-NEXT: callq __muldc3@PLT +; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) { +; CHECK-LABEL: intrinsic_f80: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq __mulxc3@PLT +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) + ret <2 x x86_fp80> %mul +} + +define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) { +; CHECK-LABEL: intrinsic_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movq %rsp, %rdi +; CHECK-NEXT: callq __multc3@PLT +; CHECK-NEXT: movaps (%rsp), %xmm0 +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w) + ret <2 x fp128> %mul +} + diff --git a/llvm/test/CodeGen/X86/complex-divide.ll 
b/llvm/test/CodeGen/X86/complex-divide.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-divide.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Check the expansion of the complex divide intrinsic. This only tests +; expansion for 32-bit floats, as the expansion should produce identical IR +; expansions save for the ABI of calling __divsc3, which is tested (indirectly) +; for each type individually in complex-{32,64}bit.ll. + +declare <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float>, <2 x float>) + +; Generate a call to __divsc3 +define <2 x float> @intrinsic_slow_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_slow_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; CHECK-NEXT: callq __divsc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %div +} + +; Do not do an expansion (because fast is not sufficient to imply full +; complex-range=limited. +define <2 x float> @intrinsic_implied_not_limited_f32(<2 x float> %z, <2 x float> %w) #1 { +; CHECK-LABEL: intrinsic_implied_not_limited_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovaps %xmm1, %xmm2 +; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-NEXT: callq __divsc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %div +} + +; Do an expansion (because of complex-range=limited) +define <2 x float> @intrinsic_limited_f32(<2 x float> %z, <2 x float> %w) #1 { +; CHECK-LABEL: intrinsic_limited_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-COUNT-2: vmulss +; CHECK-NEXT: vaddss {{.*}} %xmm4 +; CHECK-COUNT-2: vmulss +; CHECK-NEXT: vaddss {{.*}} %xmm5 +; CHECK-NEXT: vdivss %xmm4, %xmm5, %xmm5 +; CHECK-COUNT-2: vmulss +; CHECK-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vdivss %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3] +; CHECK-NEXT: retq + %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0 + ret <2 x float> %div +} + +; Do an expansion, and use the FMA (because of fast-math flags). 
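+; The call below carries both the fast flag and "complex-range"="limited"
+; (attribute #0): the attribute permits the expansion, and the fast flag lets
+; the resulting mul/add/sub sequence fold into FMA.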
+define <2 x float> @intrinsic_fast_f32(<2 x float> %z, <2 x float> %w) #1 { +; CHECK-LABEL: intrinsic_fast_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-NEXT: vmulss %xmm3, %xmm3, %xmm4 +; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm1 * xmm1) + xmm4 +; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm5 +; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm5 = (xmm0 * xmm1) + xmm5 +; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero +; CHECK-NEXT: vdivss %xmm4, %xmm6, %xmm4 +; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm5 +; CHECK-NEXT: vmulss %xmm3, %xmm0, %xmm0 +; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm2 * xmm1) - xmm0 +; CHECK-NEXT: vmulss %xmm4, %xmm0, %xmm0 +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3] +; CHECK-NEXT: retq + %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0 + ret <2 x float> %div +} + +attributes #0 = { "complex-range"="limited" } +attributes #1 = { "target-features"="+fma" } +attributes #2 = { "complex-range"="no-nan" } diff --git a/llvm/test/CodeGen/X86/complex-multiply.ll b/llvm/test/CodeGen/X86/complex-multiply.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-multiply.ll @@ -0,0 +1,525 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+fma | FileCheck %s --check-prefixes=ALL,FMA +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512VL + + +; Check the expansion of the complex multiply intrinsic. This only tests +; expansion for 32-bit floats, as the expansion should produce identical IR +; expansions save for ABI of calling __mulsc3, which is tested for each type +; individually in complex-{32,64}bit.ll. + +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float>, <4 x float>) +declare <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float>, <8 x float>) +declare <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float>, <16 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) +declare <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double>, <4 x double>) +declare <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double>, <8 x double>) +declare <6 x float> @llvm.experimental.complex.fmul.v6f32(<6 x float>, <6 x float>) +declare <6 x double> @llvm.experimental.complex.fmul.v6f64(<6 x double>, <6 x double>) +declare <32 x float> @llvm.experimental.complex.fmul.v32f32(<32 x float>, <32 x float>) + +; Generate a call to __mulsc3 +define <2 x float> @intrinsic_slow_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_slow_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovaps %xmm1, %xmm2 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; ALL-NEXT: callq __mulsc3@PLT +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +; Do an expansion (because of fast-math flags). 
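+; Without a "complex-range" attribute, the nnan/ninf flags on the call imply
+; the "no-nan" range, which is sufficient for multiplication.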
+define <2 x float> @intrinsic_implied_limited_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_implied_limited_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call nnan ninf <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +; Do an expansion (because of complex-range=limited). +define <2 x float> @intrinsic_limited_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_limited_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) #0 + ret <2 x float> %mul +} + +; Do an expansion, and use the FMA (because of fast-math flags). +define <2 x float> @intrinsic_fast_v2f32(<2 x float> %z, <2 x float> %w) { +; ALL-LABEL: intrinsic_fast_v2f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call fast <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <4 x float> @intrinsic_slow_v4f32(<4 x float> %z, <4 x float> %w) { +; ALL-LABEL: intrinsic_slow_v4f32: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovaps %xmm1, %xmm2 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; ALL-NEXT: callq __mulsc3@PLT +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> %z, <4 x float> %w) + ret <4 x float> %mul +} + +define <4 x float> @intrinsic_fast_v4f32(<4 x float> %z, <4 x float> %w) { +; ALL-LABEL: intrinsic_fast_v4f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call fast <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> %z, <4 x float> %w) + ret <4 x float> %mul +} + +define <4 x float> @intrinsic_limited_v4f32(<4 x float> %z, <4 x float> %w) { +; ALL-LABEL: intrinsic_limited_v4f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,0,3,2] +; ALL-NEXT: vmulps %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> %z, <4 x float> %w) #0 + ret <4 x float> %mul +} + +define <8 x float> @intrinsic_slow_v8f32(<8 x float> %z, <8 x float> %w) { +; ALL-LABEL: intrinsic_slow_v8f32: +; ALL: # 
%bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovaps %ymm1, %ymm2 +; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; ALL-NEXT: callq __mulsc3@PLT +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float> %z, <8 x float> %w) + ret <8 x float> %mul +} + +define <8 x float> @intrinsic_fast_v8f32(<8 x float> %z, <8 x float> %w) { +; ALL-LABEL: intrinsic_fast_v8f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovsldup {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call fast <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float> %z, <8 x float> %w) + ret <8 x float> %mul +} + +define <8 x float> @intrinsic_limited_v8f32(<8 x float> %z, <8 x float> %w) { +; ALL-LABEL: intrinsic_limited_v8f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovsldup {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call <8 x float> @llvm.experimental.complex.fmul.v8f32(<8 x float> %z, <8 x float> %w) #0 + ret <8 x float> %mul +} + +define <16 x float> @intrinsic_slow_v16f32(<16 x float> %z, <16 x float> %w) { +; FMA-LABEL: intrinsic_slow_v16f32: +; FMA: # %bb.0: +; FMA-NEXT: pushq %rax +; FMA-NEXT: .cfi_def_cfa_offset 16 +; FMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; FMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; FMA-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; FMA-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; FMA-NEXT: callq __mulsc3@PLT +; FMA-NEXT: popq %rax +; FMA-NEXT: .cfi_def_cfa_offset 8 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_slow_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; AVX512VL-NEXT: vmovaps %zmm1, %zmm2 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: callq __mulsc3@PLT +; AVX512VL-NEXT: popq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-NEXT: retq + %mul = call <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float> %z, <16 x float> %w) + ret <16 x float> %mul +} + + +define <16 x float> @intrinsic_fast_v16f32(<16 x float> %z, <16 x float> %w) { +; FMA-LABEL: intrinsic_fast_v16f32: +; FMA: # %bb.0: +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm2[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm2 = ymm3[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovsldup {{.*#+}} ymm3 = ymm3[0,0,2,2,4,4,6,6] +; FMA-NEXT: 
vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float> %z, <16 x float> %w) + ret <16 x float> %mul +} + +define <16 x float> @intrinsic_limited_v16f32(<16 x float> %z, <16 x float> %w) { +; FMA-LABEL: intrinsic_limited_v16f32: +; FMA: # %bb.0: +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm2[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm2 = ymm3[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovsldup {{.*#+}} ymm3 = ymm3[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_limited_v16f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call <16 x float> @llvm.experimental.complex.fmul.v16f32(<16 x float> %z, <16 x float> %w) #0 + ret <16 x float> %mul +} + +define <2 x double> @intrinsic_slow_v2f64(<2 x double> %z, <2 x double> %w) { +; ALL-LABEL: intrinsic_slow_v2f64: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovapd %xmm1, %xmm2 +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; ALL-NEXT: callq __muldc3@PLT +; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x double> @intrinsic_fast_v2f64(<2 x double> %z, <2 x double> %w) { +; ALL-LABEL: intrinsic_fast_v2f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,1] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; ALL-NEXT: vmulpd %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) +/- xmm2 +; ALL-NEXT: retq + %mul = call fast <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} + +define <2 x double> @intrinsic_limited_v2f64(<2 x double> %z, <2 x double> %w) { +; ALL-LABEL: intrinsic_limited_v2f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,1] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; ALL-NEXT: vmulpd %xmm2, %xmm3, %xmm2 +; ALL-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) 
+/- xmm2 +; ALL-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) #0 + ret <2 x double> %mul +} + +define <4 x double> @intrinsic_slow_v4f64(<4 x double> %z, <4 x double> %w) { +; ALL-LABEL: intrinsic_slow_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: vmovapd %ymm1, %ymm2 +; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; ALL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __muldc3@PLT +; ALL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ALL-NEXT: popq %rax +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq + %mul = call <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %z, <4 x double> %w) + ret <4 x double> %mul +} + +define <4 x double> @intrinsic_fast_v4f64(<4 x double> %z, <4 x double> %w) { +; ALL-LABEL: intrinsic_fast_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm1[1,1,3,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,3,2] +; ALL-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call fast <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %z, <4 x double> %w) + ret <4 x double> %mul +} + +define <4 x double> @intrinsic_limited_v4f64(<4 x double> %z, <4 x double> %w) { +; ALL-LABEL: intrinsic_limited_v4f64: +; ALL: # %bb.0: +; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm1[1,1,3,3] +; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,3,2] +; ALL-NEXT: vmulpd %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; ALL-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %z, <4 x double> %w) #0 + ret <4 x double> %mul +} + +define <8 x double> @intrinsic_slow_v8f64(<8 x double> %z, <8 x double> %w) { +; FMA-LABEL: intrinsic_slow_v8f64: +; FMA: # %bb.0: +; FMA-NEXT: pushq %rax +; FMA-NEXT: .cfi_def_cfa_offset 16 +; FMA-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; FMA-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; FMA-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; FMA-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; FMA-NEXT: vzeroupper +; FMA-NEXT: callq __muldc3@PLT +; FMA-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FMA-NEXT: popq %rax +; FMA-NEXT: .cfi_def_cfa_offset 8 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_slow_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: pushq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 16 +; AVX512VL-NEXT: vmovapd %zmm1, %zmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq __muldc3@PLT +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: popq %rax +; AVX512VL-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-NEXT: retq + %mul = call <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double> %z, <8 x double> %w) + ret <8 x double> %mul +} + +define <8 x double> @intrinsic_fast_v8f64(<8 x double> %z, <8 x double> %w) { +; FMA-LABEL: intrinsic_fast_v8f64: +; FMA: # %bb.0: +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm2[1,1,3,3] +; FMA-NEXT: 
vpermilpd {{.*#+}} ymm5 = ymm0[1,0,3,2] +; FMA-NEXT: vmulpd %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vpermilpd {{.*#+}} ymm2 = ymm3[1,1,3,3] +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2] +; FMA-NEXT: vmulpd %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovddup {{.*#+}} ymm3 = ymm3[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6] +; AVX512VL-NEXT: vmulpd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovddup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6] +; AVX512VL-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double> %z, <8 x double> %w) + ret <8 x double> %mul +} + +define <8 x double> @intrinsic_limited_v8f64(<8 x double> %z, <8 x double> %w) { +; FMA-LABEL: intrinsic_limited_v8f64: +; FMA: # %bb.0: +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm2[1,1,3,3] +; FMA-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,0,3,2] +; FMA-NEXT: vmulpd %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovddup {{.*#+}} ymm2 = ymm2[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) +/- ymm4 +; FMA-NEXT: vpermilpd {{.*#+}} ymm2 = ymm3[1,1,3,3] +; FMA-NEXT: vpermilpd {{.*#+}} ymm4 = ymm1[1,0,3,2] +; FMA-NEXT: vmulpd %ymm2, %ymm4, %ymm2 +; FMA-NEXT: vmovddup {{.*#+}} ymm3 = ymm3[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) +/- ymm2 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_limited_v8f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6] +; AVX512VL-NEXT: vmulpd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovddup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6] +; AVX512VL-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call <8 x double> @llvm.experimental.complex.fmul.v8f64(<8 x double> %z, <8 x double> %w) #0 + ret <8 x double> %mul +} + +define <6 x float> @intrinsic_fast_v6f32(<6 x float> %z, <6 x float> %w) { +; ALL-LABEL: intrinsic_fast_v6f32: +; ALL: # %bb.0: +; ALL-NEXT: vmovshdup {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] +; ALL-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vmulps %ymm2, %ymm3, %ymm2 +; ALL-NEXT: vmovsldup {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; ALL-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) +/- ymm2 +; ALL-NEXT: retq + %mul = call fast <6 x float> @llvm.experimental.complex.fmul.v6f32(<6 x float> %z, <6 x float> %w) + ret <6 x float> %mul +} + +define <6 x double> @intrinsic_fast_v6f64(<6 x double> %z, <6 x double> %w) { +; FMA-LABEL: intrinsic_fast_v6f64: +; FMA: # %bb.0: +; FMA-NEXT: movq %rdi, %rax +; FMA-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; FMA-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; FMA-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm7[0] +; FMA-NEXT: vinsertf128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; FMA-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,3,2] +; FMA-NEXT: vpermilpd {{.*#+}} ymm3 = ymm1[1,1,3,3] +; FMA-NEXT: vmulpd %ymm3, %ymm2, %ymm2 +; FMA-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} ymm1 = (ymm0 * ymm1) +/- ymm2 +; FMA-NEXT: vunpcklpd {{.*#+}} xmm0 = 
xmm4[0],xmm5[0] +; FMA-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm5[0],xmm4[0] +; FMA-NEXT: vmovapd {{[0-9]+}}(%rsp), %xmm3 +; FMA-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,1] +; FMA-NEXT: vmulpd %xmm4, %xmm2, %xmm2 +; FMA-NEXT: vmovddup {{.*#+}} xmm3 = xmm3[0,0] +; FMA-NEXT: vfmaddsub213pd {{.*#+}} xmm3 = (xmm0 * xmm3) +/- xmm2 +; FMA-NEXT: vmovapd %xmm3, 32(%rdi) +; FMA-NEXT: vmovapd %ymm1, (%rdi) +; FMA-NEXT: vzeroupper +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v6f64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm2 = zmm1[1,1,3,3,5,5,7,7] +; AVX512VL-NEXT: vpermilpd {{.*#+}} zmm3 = zmm0[1,0,3,2,5,4,7,6] +; AVX512VL-NEXT: vmulpd %zmm2, %zmm3, %zmm2 +; AVX512VL-NEXT: vmovddup {{.*#+}} zmm1 = zmm1[0,0,2,2,4,4,6,6] +; AVX512VL-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <6 x double> @llvm.experimental.complex.fmul.v6f64(<6 x double> %z, <6 x double> %w) + ret <6 x double> %mul +} + +; Test the vector bigger than 512 bits. +define <32 x float> @intrinsic_fast_v32f32(<32 x float> %z, <32 x float> %w) { +; FMA-LABEL: intrinsic_fast_v32f32: +; FMA: # %bb.0: +; FMA-NEXT: vmovshdup {{.*#+}} ymm8 = ymm4[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm8, %ymm9, %ymm8 +; FMA-NEXT: vmovsldup {{.*#+}} ymm4 = ymm4[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) +/- ymm8 +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm5[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm8 = ymm1[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm8, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm5 = ymm5[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm6[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm2[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm5 = ymm6[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm2 = (ymm5 * ymm2) +/- ymm4 +; FMA-NEXT: vmovshdup {{.*#+}} ymm4 = ymm7[1,1,3,3,5,5,7,7] +; FMA-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[1,0,3,2,5,4,7,6] +; FMA-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; FMA-NEXT: vmovsldup {{.*#+}} ymm5 = ymm7[0,0,2,2,4,4,6,6] +; FMA-NEXT: vfmaddsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) +/- ymm4 +; FMA-NEXT: retq +; +; AVX512VL-LABEL: intrinsic_fast_v32f32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm4 = zmm2[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm4, %zmm5, %zmm4 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm2 = zmm2[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) +/- zmm4 +; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm2 = zmm3[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm1[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512VL-NEXT: vmulps %zmm2, %zmm4, %zmm2 +; AVX512VL-NEXT: vmovsldup {{.*#+}} zmm3 = zmm3[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; AVX512VL-NEXT: vfmaddsub213ps {{.*#+}} zmm1 = (zmm3 * zmm1) +/- zmm2 +; AVX512VL-NEXT: retq + %mul = call fast <32 x float> @llvm.experimental.complex.fmul.v32f32(<32 x float> %z, <32 x float> %w) + ret <32 x float> %mul +} + +attributes #0 = { "complex-range"="limited" } diff --git a/llvm/test/CodeGen/X86/complex-win32.ll b/llvm/test/CodeGen/X86/complex-win32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-win32.ll @@ -0,0 +1,59 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-windows-msvc | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 32-bit +; windows API. Compiler-rt only includes mulsc3/muldc3, so we only test those. + +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $24, %esp +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: fstps (%esp) +; CHECK-NEXT: calll ___mulsc3 +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: addl $24, %esp +; CHECK-NEXT: retl + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $56, %esp +; CHECK-NEXT: fldl 8(%ebp) +; CHECK-NEXT: fldl 16(%ebp) +; CHECK-NEXT: fldl 24(%ebp) +; CHECK-NEXT: fldl 32(%ebp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: leal {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: calll ___muldc3 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: retl + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} diff --git a/llvm/test/CodeGen/X86/complex-win64.ll b/llvm/test/CodeGen/X86/complex-win64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/complex-win64.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-windows-msvc | FileCheck %s + +; Check that we handle the ABI of the complex functions correctly for 64-bit +; windows API. Compiler-rt only includes mulsc3/muldc3, so we only test those. 
+ +declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>) +declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) nounwind { +; CHECK-LABEL: intrinsic_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: movaps (%rdx), %xmm2 +; CHECK-NEXT: movaps (%rcx), %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] +; CHECK-NEXT: movaps %xmm2, %xmm3 +; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1] +; CHECK-NEXT: callq __mulsc3 +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) + ret <2 x float> %mul +} + +define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) nounwind { +; CHECK-LABEL: intrinsic_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $56, %rsp +; CHECK-NEXT: movaps (%rdx), %xmm3 +; CHECK-NEXT: movaps (%rcx), %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm2 +; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1] +; CHECK-NEXT: movhps %xmm3, {{[0-9]+}}(%rsp) +; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx +; CHECK-NEXT: callq __muldc3 +; CHECK-NEXT: movups {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: addq $56, %rsp +; CHECK-NEXT: retq + %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w) + ret <2 x double> %mul +} diff --git a/llvm/test/CodeGen/X86/fp16-complex-multiply.ll b/llvm/test/CodeGen/X86/fp16-complex-multiply.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp16-complex-multiply.ll @@ -0,0 +1,231 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s + +declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>) +declare <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half>, <4 x half>) +declare <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half>, <8 x half>) +declare <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half>, <16 x half>) +declare <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half>, <32 x half>) +declare <20 x half> @llvm.experimental.complex.fmul.v20f16(<20 x half>, <20 x half>) +declare <64 x half> @llvm.experimental.complex.fmul.v64f16(<64 x half>, <64 x half>) + +; FIXME: llvm.experimental.complex.fmul.v2f16 should be lowered to vfmulcsh +define <2 x half> @intrinsic_fast_v2f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call fast <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <4 x half> @intrinsic_fast_v4f16(<4 x half> %z, <4 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call fast <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half> %z, <4 x half> %w) + ret <4 x half> %mul +} + +define <8 x half> @intrinsic_fast_v8f16(<8 x half> %z, <8 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 
+; CHECK-NEXT: retq + %mul = call fast <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half> %z, <8 x half> %w) + ret <8 x half> %mul +} + +define <16 x half> @intrinsic_fast_v16f16(<16 x half> %z, <16 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %mul = call fast <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half> %z, <16 x half> %w) + ret <16 x half> %mul +} + +define <32 x half> @intrinsic_fast_v32f16(<32 x half> %z, <32 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call fast <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half> %z, <32 x half> %w) + ret <32 x half> %mul +} + +define <20 x half> @intrinsic_fast_v20f16(<20 x half> %z, <20 x half> %w) { +; CHECK-LABEL: intrinsic_fast_v20f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call fast <20 x half> @llvm.experimental.complex.fmul.v20f16(<20 x half> %z, <20 x half> %w) + ret <20 x half> %mul +} + +define <2 x half> @intrinsic_limited_v2f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) #0 + ret <2 x half> %mul +} + +define <4 x half> @intrinsic_limited_v4f16(<4 x half> %z, <4 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half> %z, <4 x half> %w) #0 + ret <4 x half> %mul +} + +define <8 x half> @intrinsic_limited_v8f16(<8 x half> %z, <8 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %mul = call <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half> %z, <8 x half> %w) #0 + ret <8 x half> %mul +} + +define <16 x half> @intrinsic_limited_v16f16(<16 x half> %z, <16 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq + %mul = call <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half> %z, <16 x half> %w) #0 + ret <16 x half> %mul +} + +define <32 x half> @intrinsic_limited_v32f16(<32 x half> %z, <32 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half> %z, <32 x half> %w) #0 + ret <32 x half> %mul +} + +define <20 x half> @intrinsic_limited_v20f16(<20 x half> %z, <20 x half> %w) { +; CHECK-LABEL: intrinsic_limited_v20f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: retq + %mul = call <20 x half> @llvm.experimental.complex.fmul.v20f16(<20 x half> %z, <20 x half> %w) #0 + ret <20 x half> %mul +} + +; Test the vector size bigger than 512 bits +define <64 x half> @intrinsic_limited_v64f16(<64 x half> %z, <64 x half> %w) { +; 
CHECK-LABEL: intrinsic_limited_v64f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vfmulcph %zmm2, %zmm0, %zmm4 +; CHECK-NEXT: vfmulcph %zmm3, %zmm1, %zmm2 +; CHECK-NEXT: vmovaps %zmm4, %zmm0 +; CHECK-NEXT: vmovaps %zmm2, %zmm1 +; CHECK-NEXT: retq + %mul = call <64 x half> @llvm.experimental.complex.fmul.v64f16(<64 x half> %z, <64 x half> %w) #0 + ret <64 x half> %mul +} + +define <2 x half> @intrinsic_slow_v2f16(<2 x half> %z, <2 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %xmm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w) + ret <2 x half> %mul +} + +define <4 x half> @intrinsic_slow_v4f16(<4 x half> %z, <4 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %xmm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <4 x half> @llvm.experimental.complex.fmul.v4f16(<4 x half> %z, <4 x half> %w) + ret <4 x half> %mul +} + +define <8 x half> @intrinsic_slow_v8f16(<8 x half> %z, <8 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %xmm1, %xmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <8 x half> @llvm.experimental.complex.fmul.v8f16(<8 x half> %z, <8 x half> %w) + ret <8 x half> %mul +} + +define <16 x half> @intrinsic_slow_v16f16(<16 x half> %z, <16 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa %ymm1, %ymm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: # kill: def $xmm2 killed $xmm2 killed $ymm2 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <16 x half> @llvm.experimental.complex.fmul.v16f16(<16 x half> %z, <16 x half> %w) + ret <16 x half> %mul +} + +define <32 x half> @intrinsic_slow_v32f16(<32 x half> %z, <32 x half> %w) { +; CHECK-LABEL: intrinsic_slow_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2 +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpsrld $16, %xmm2, %xmm3 +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: # kill: def $xmm2 killed $xmm2 killed $zmm2 +; CHECK-NEXT: callq __mulhc3@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq + %mul = call <32 x half> @llvm.experimental.complex.fmul.v32f16(<32 x half> %z, <32 x half> %w) + ret <32 x half> %mul +} + +attributes #0 = { "complex-range"="limited" } diff --git a/llvm/test/Transforms/InstCombine/complex-math.ll b/llvm/test/Transforms/InstCombine/complex-math.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/Transforms/InstCombine/complex-math.ll @@ -0,0 +1,278 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals +; RUN: opt < %s -instcombine -S -inst-combine-complex | FileCheck %s + +; Check that we match the simple expansions of complex multiplication and +; division, whether the target complex value is made by returning a struct, +; vector, or by storing into memory. + +%complex.double = type {double, double} + +define %complex.double @struct_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @struct_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %x = fsub double %ac, %bd + %y = fadd double %ad, %bc + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define <2 x double> @vector_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @vector_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: ret <2 x double> [[TMP5]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %x = fsub double %ac, %bd + %y = fadd double %ad, %bc + %res = insertelement <2 x double> zeroinitializer, double %x, i32 0 + %res.1 = insertelement <2 x double> %res, double %y, i32 1 + ret <2 x double> %res.1 +} + +define void @memory_mul(double %a, double %b, double %c, double %d, %complex.double* %dest) { +; CHECK-LABEL: @memory_mul( +; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call 
<2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8 +; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8 +; CHECK-NEXT: ret void +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %x = fsub double %ac, %bd + %y = fadd double %ad, %bc + %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0 + %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1 + store double %x, double* %dest.real + store double %y, double* %dest.imag + ret void +} + +define %complex.double @fast_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fast_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul fast double %a, %c + %bd = fmul fast double %b, %d + %ad = fmul fast double %a, %d + %bc = fmul fast double %b, %c + %x = fsub fast double %ac, %bd + %y = fadd fast double %ad, %bc + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define %complex.double @fastish_mul(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fastish_mul( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call ninf <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul fast double %a, %c + %bd = fmul nnan ninf nsz double %b, %d + %ad = fmul ninf arcp contract double %a, %d + %bc = fmul reassoc nsz ninf double %b, %c + %x = fsub ninf arcp afn double %ac, %bd + %y = fadd afn nnan ninf double %ad, %bc + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue 
%complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define %complex.double @struct_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @struct_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %cc = fmul double %c, %c + %dd = fmul double %d, %d + %scale = fadd double %cc, %dd + %x_noscale = fadd double %ac, %bd + %y_noscale = fsub double %bc, %ad + %x = fdiv double %x_noscale, %scale + %y = fdiv double %y_noscale, %scale + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define <2 x double> @vector_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @vector_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: ret <2 x double> [[TMP5]] +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %cc = fmul double %c, %c + %dd = fmul double %d, %d + %scale = fadd double %cc, %dd + %x_noscale = fadd double %ac, %bd + %y_noscale = fsub double %bc, %ad + %x = fdiv double %x_noscale, %scale + %y = fdiv double %y_noscale, %scale + %res = insertelement <2 x double> zeroinitializer, double %x, i32 0 + %res.1 = insertelement <2 x double> %res, double %y, i32 1 + ret <2 x double> %res.1 +} + +define void @memory_div(double %a, double %b, double %c, double %d, %complex.double* %dest) { +; CHECK-LABEL: @memory_div( +; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 
x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8 +; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8 +; CHECK-NEXT: ret void +; + %ac = fmul double %a, %c + %bd = fmul double %b, %d + %ad = fmul double %a, %d + %bc = fmul double %b, %c + %cc = fmul double %c, %c + %dd = fmul double %d, %d + %scale = fadd double %cc, %dd + %x_noscale = fadd double %ac, %bd + %y_noscale = fsub double %bc, %ad + %x = fdiv double %x_noscale, %scale + %y = fdiv double %y_noscale, %scale + %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0 + %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1 + store double %x, double* %dest.real + store double %y, double* %dest.imag + ret void +} + +define %complex.double @fast_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fast_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul fast double %a, %c + %bd = fmul fast double %b, %d + %ad = fmul fast double %a, %d + %bc = fmul fast double %b, %c + %cc = fmul fast double %c, %c + %dd = fmul fast double %d, %d + %scale = fadd fast double %cc, %dd + %x_noscale = fadd fast double %ac, %bd + %y_noscale = fsub fast double %bc, %ad + %x = fdiv fast double %x_noscale, %scale + %y = fdiv fast double %y_noscale, %scale + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +define %complex.double @fastish_div(double %a, double %b, double %c, double %d) { +; CHECK-LABEL: @fastish_div( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = call arcp <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 +; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0 +; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1 +; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]] +; + %ac = fmul arcp contract double %a, %c 
+ %bd = fmul arcp afn ninf reassoc double %b, %d + %ad = fmul arcp afn ninf double %a, %d + %bc = fmul arcp nsz reassoc double %b, %c + %cc = fmul arcp nsz afn double %c, %c + %dd = fmul arcp nsz double %d, %d + %scale = fadd arcp nsz contract nnan reassoc double %cc, %dd + %x_noscale = fadd arcp nsz contract ninf nnan double %ac, %bd + %y_noscale = fsub arcp nsz contract reassoc double %bc, %ad + %x = fdiv arcp ninf nnan reassoc double %x_noscale, %scale + %y = fdiv arcp nnan double %y_noscale, %scale + %res = insertvalue %complex.double zeroinitializer, double %x, 0 + %res.1 = insertvalue %complex.double %res, double %y, 1 + ret %complex.double %res.1 +} + +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync nounwind readnone willreturn } +; CHECK: attributes #[[ATTR1]] = { "complex-range"="limited" } +;.
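
For reference, the patterns matched by the InstCombine tests above (and the limited-range lowerings checked in the codegen tests earlier) correspond to the textbook complex formulas with no NaN/infinity recovery step; for operands a + bi and c + di this is:

\[(a + bi)(c + di) = (ac - bd) + (ad + bc)\,i\]
\[\frac{a + bi}{c + di} = \frac{(ac + bd) + (bc - ad)\,i}{c^2 + d^2}\]

This is a sketch of the intended semantics only: the recognized calls carry the "complex-range"="limited" attribute (or fast-math flags) precisely because these expansions are not correct for all inputs, and the full-range cases instead go through the __mulsc3/__muldc3-style library calls exercised in the slow, win32, and win64 tests.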