diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -185,6 +185,8 @@
 /// float-to-int conversion instructions.
 CODEGENOPT(StrictFloatCastOverflow, 1, 1)
+CODEGENOPT(UseComplexIntrinsics, 1, 0) ///< Use LLVM complex intrinsics
+
 CODEGENOPT(UniformWGSize , 1, 0) ///< -cl-uniform-work-group-size
 CODEGENOPT(NoZeroInitializedInBSS , 1, 0) ///< -fno-zero-initialized-in-bss.
 /// Method of Objective-C dispatch to use.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1816,6 +1816,13 @@
            "floating-point expressions are evaluated">,
   NegFlag<SetFalse>>;
 
+defm use_complex_intrinsics : BoolFOption<"use-complex-intrinsics",
+  CodeGenOpts<"UseComplexIntrinsics">, DefaultFalse,
+  PosFlag<SetTrue>,
+  NegFlag<SetFalse>>;
+
 def ffor_scope : Flag<["-"], "ffor-scope">, Group<f_Group>;
 def fno_for_scope : Flag<["-"], "fno-for-scope">, Group<f_Group>;
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -88,6 +88,13 @@
   ComplexPairTy EmitScalarToComplexCast(llvm::Value *Val, QualType SrcType,
                                         QualType DestType, SourceLocation Loc);
 
+  /// Convert an LLVM complex type representation into a pair of values.
+  ComplexPairTy BreakLLVMComplexType(llvm::Value *Val) {
+    llvm::Value *Real = Builder.CreateExtractElement(Val, uint64_t(0));
+    llvm::Value *Imag = Builder.CreateExtractElement(Val, uint64_t(1));
+    return ComplexPairTy(Real, Imag);
+  }
+
   //===--------------------------------------------------------------------===//
   //                            Visitor Methods
   //===--------------------------------------------------------------------===//
@@ -701,6 +708,14 @@
   // still more of this within the type system.
 
   if (Op.LHS.second && Op.RHS.second) {
+    if (CGF.CGM.getCodeGenOpts().UseComplexIntrinsics) {
+      Value *Op0 = Builder.CreateComplexValue(Op.LHS.first, Op.LHS.second);
+      Value *Op1 = Builder.CreateComplexValue(Op.RHS.first, Op.RHS.second);
+      // TODO: Support STDC CX_LIMITED_RANGE here.
+      Value *Result = Builder.CreateComplexMul(Op0, Op1, false);
+      return BreakLLVMComplexType(Result);
+    }
+
     // If both operands are complex, emit the core math directly, and then
     // test for NaNs. If we find NaNs in the result, we delegate to a libcall
     // to carefully re-compute the correct infinity representation if
@@ -794,6 +809,19 @@
   llvm::Value *DSTr, *DSTi;
   if (LHSr->getType()->isFloatingPointTy()) {
+    // If we are using complex intrinsics, do so whenever the right-hand side
+    // is complex, since no major simplification is possible in this scenario.
+    // (Simplifications are possible if the LHS is real or pure imaginary.)
+    if (CGF.CGM.getCodeGenOpts().UseComplexIntrinsics && RHSi) {
+      llvm::Value *Op0 =
+          Builder.CreateComplexValue(Op.LHS.first, Op.LHS.second);
+      llvm::Value *Op1 =
+          Builder.CreateComplexValue(Op.RHS.first, Op.RHS.second);
+      // TODO: Support STDC CX_LIMITED_RANGE here.
+      llvm::Value *Result = Builder.CreateComplexDiv(Op0, Op1, false);
+      return BreakLLVMComplexType(Result);
+    }
+
     // If we have a complex operand on the RHS and FastMath is not allowed, we
     // delegate to a libcall to handle all of the complexities and minimize
     // underflow/overflow cases. When FastMath is allowed we construct the
diff --git a/clang/test/CodeGen/complex-intrinsics.c b/clang/test/CodeGen/complex-intrinsics.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/complex-intrinsics.c
@@ -0,0 +1,180 @@
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-pc-win64 -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple i686-unknown-unknown -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple spir -o - | FileCheck %s --check-prefix=NOINTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -fuse-complex-intrinsics -o - | FileCheck %s --check-prefix=INTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-pc-win64 -fuse-complex-intrinsics -o - | FileCheck %s --check-prefix=INTRIN
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -fuse-complex-intrinsics -DT=int -o - | FileCheck %s --check-prefix=INT
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -DT=int -o - | FileCheck %s --check-prefix=INT
+
+#ifndef T
+#define T float
+#endif
+
+T check_var;
+// INTRIN: @check_var = global [[T:[a-z0-9]+]]
+// NOINTRIN: @check_var = global [[T:[a-z0-9]+]]
+// INT: @check_var = global [[T:i[0-9]+]]
+
+T _Complex add_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @add_rc(
+  // INTRIN-COUNT-1: fadd [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @add_rc(
+  // NOINTRIN-COUNT-1: fadd [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @add_rc(
+  // INT-COUNT-1: add [[T]]
+  // INT: ret
+  return a + b;
+}
+
+T _Complex add_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @add_cr(
+  // INTRIN-COUNT-1: fadd [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @add_cr(
+  // NOINTRIN-COUNT-1: fadd [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @add_cr(
+  // INT-COUNT-1: add [[T]]
+  // INT: ret
+  return a + b;
+}
+
+T _Complex add_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @add_cc(
+  // INTRIN-COUNT-2: fadd [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @add_cc(
+  // NOINTRIN-COUNT-2: fadd [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @add_cc(
+  // INT-COUNT-2: add [[T]]
+  // INT: ret
+  return a + b;
+}
+
+T _Complex sub_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @sub_rc(
+  // INTRIN: fsub [[T]]
+  // INTRIN: fneg [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @sub_rc(
+  // NOINTRIN: fsub [[T]]
+  // NOINTRIN: fneg [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @sub_rc(
+  // INT-COUNT-2: sub [[T]]
+  // INT: ret
+  return a - b;
+}
+
+T _Complex sub_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @sub_cr(
+  // INTRIN: fsub [[T]]
+  // INTRIN-NOT: fsub [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @sub_cr(
+  // NOINTRIN: fsub [[T]]
+  // NOINTRIN-NOT: fsub [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @sub_cr(
+  // INT-COUNT-2: sub [[T]]
+  // INT: ret
+  return a - b;
+}
+
+T _Complex sub_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @sub_cc(
+  // INTRIN-COUNT-2: fsub [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @sub_cc(
+  // NOINTRIN-COUNT-2: fsub [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @sub_cc(
+  // INT-COUNT-2: sub [[T]]
+  // INT: ret
+  return a - b;
+}
+
+T _Complex mul_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @mul_rc(
+  // INTRIN-COUNT-2: fmul [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @mul_rc(
+  // NOINTRIN-COUNT-2: fmul [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @mul_rc(
+  // INT-COUNT-4: mul [[T]]
+  // INT: ret
+  return a * b;
+}
+
+T _Complex mul_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @mul_cr(
+  // INTRIN-COUNT-2: fmul [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @mul_cr(
+  // NOINTRIN-COUNT-2: fmul [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @mul_cr(
+  // INT-COUNT-4: mul [[T]]
+  // INT: ret
+  return a * b;
+}
+
+T _Complex mul_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @mul_cc(
+  // INTRIN-NOT: fmul [[T]]
+  // INTRIN: call {{.*}} @llvm.experimental.complex.fmul
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @mul_cc(
+  // NOINTRIN-COUNT-4: fmul [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @mul_cc(
+  // INT-COUNT-4: mul [[T]]
+  // INT: ret
+  return a * b;
+}
+
+T _Complex div_rc(T a, T _Complex b) {
+  // INTRIN-LABEL: @div_rc(
+  // INTRIN-NOT: fdiv [[T]]
+  // INTRIN: call {{.*}} @llvm.experimental.complex.fdiv
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @div_rc(
+  // NOINTRIN: call {{.*}} @__div
+  // NOINTRIN: ret
+  // INT-LABEL: @div_rc(
+  // INT-COUNT-6: mul [[T]]
+  // INT: ret
+  return a / b;
+}
+
+T _Complex div_cr(T _Complex a, T b) {
+  // INTRIN-LABEL: @div_cr(
+  // INTRIN-COUNT-2: fdiv [[T]]
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @div_cr(
+  // NOINTRIN-COUNT-2: fdiv [[T]]
+  // NOINTRIN: ret
+  // INT-LABEL: @div_cr(
+  // INT-COUNT-5: mul [[T]]
+  // INT: ret
+  return a / b;
+}
+
+T _Complex div_cc(T _Complex a, T _Complex b) {
+  // INTRIN-LABEL: @div_cc(
+  // INTRIN-NOT: fdiv [[T]]
+  // INTRIN: call {{.*}} @llvm.experimental.complex.fdiv
+  // INTRIN: ret
+  // NOINTRIN-LABEL: @div_cc(
+  // NOINTRIN: call {{.*}} @__div
+  // NOINTRIN: ret
+  // INT-LABEL: @div_cc(
+  // INT-COUNT-6: mul [[T]]
+  // INT: ret
+  return a / b;
+}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -17238,6 +17238,167 @@
 None.
 
+Complex Intrinsics
+------------------
+
+Complex numbers are currently represented, for intrinsic purposes, as vectors
+of floating-point numbers. A scalar complex type is represented using the type
+``<2 x floatty>``, with index ``0`` corresponding to the real part of the
+number and index ``1`` corresponding to the imaginary part of the number. A
+vector complex type can be represented by an even-length vector of
+floating-point numbers, with even indices (``0``, ``2``, etc.) corresponding
+to the real parts of numbers and odd indices (``1``, ``3``, etc.) to the
+corresponding imaginary parts.
+
+In general, these intrinsics have the same semantics as their definitions in
+Annex G of the C specification. In particular, this means that multiplication,
+division, and absolute value cannot be represented with their regular
+algebraic formulas, as those formulas can produce a NaN value where an
+infinity is required, or can let an intermediate value overflow. However,
+adding the ``complex-limited-range`` attribute to the call site explicitly
+requests the regular algebraic formula.
+
+In addition to the ``complex-limited-range`` attribute, these intrinsics also
+respect the fast-math flags. These flags are applied to all of the
+floating-point expressions the intrinsic would produce if it were expanded. In
+particular, either the ``nnan`` or the ``ninf`` flag is sufficient to remove
+all of the recalculation that otherwise handles NaN results.
+
+Another attribute, ``complex-no-scale``, applies to the division intrinsic.
+This attribute allows the operation to be calculated according to the regular
+algebraic formula without first scaling the operands to prevent a potentially
+spurious overflow. It still retains the checks that convert NaN results back
+to infinities (those can be removed only by ``complex-limited-range``).
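+
+As an illustration (a usage sketch only, not text from a frontend; the input
+values ``%z`` and ``%w`` are hypothetical), a default call, a call relying on
+fast-math flags, and a call requesting limited range might look like:
+
+.. code-block:: llvm
+
+      ; Full Annex G semantics; may be expanded to a __mulsc3 libcall.
+      %p = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+
+      ; nnan/ninf allow the NaN-recalculation step to be dropped.
+      %q = call nnan ninf <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+
+      ; Explicitly request the plain algebraic formula (CX_LIMITED_RANGE).
+      %r = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0
+
+      attributes #0 = { "complex-limited-range"="true" }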
+
+Intrinsics for complex addition and subtraction are not provided, as these are
+equivalent to ``fadd`` and ``fsub`` instructions, respectively.
+
+'``llvm.experimental.complex.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> <op1>, <2 x float> <op2>)
+      declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> <op1>, <2 x double> <op2>)
+      declare <4 x float> @llvm.experimental.complex.fmul.v4f32(<4 x float> <op1>, <4 x float> <op2>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.complex.fmul``' intrinsic returns the product of its
+two operands.
+
+Arguments:
+""""""""""
+
+The arguments to the '``llvm.experimental.complex.fmul``' intrinsic must be a
+:ref:`vector <t_vector>` of :ref:`floating-point <t_floating>` type whose
+length is divisible by 2.
+
+Semantics:
+""""""""""
+
+The value produced is the complex product of the two inputs.
+
+If the ``complex-limited-range`` attribute is provided, or the ``ninf`` or
+``nnan`` fast-math flags are provided, the output may be equivalent to the
+following code:
+
+.. code-block:: llvm
+
+      define <2 x float> @limited_complex_mul(<2 x float> %op1, <2 x float> %op2) {
+        %x = extractelement <2 x float> %op1, i32 0 ; real of %op1
+        %y = extractelement <2 x float> %op1, i32 1 ; imag of %op1
+        %u = extractelement <2 x float> %op2, i32 0 ; real of %op2
+        %v = extractelement <2 x float> %op2, i32 1 ; imag of %op2
+        %xu = fmul float %x, %u
+        %yv = fmul float %y, %v
+        %yu = fmul float %y, %u
+        %xv = fmul float %x, %v
+        %out_real = fsub float %xu, %yv
+        %out_imag = fadd float %yu, %xv
+        %ret.0 = insertelement <2 x float> undef, float %out_real, i32 0
+        %ret.1 = insertelement <2 x float> %ret.0, float %out_imag, i32 1
+        ret <2 x float> %ret.1
+      }
+
+Without the ``complex-limited-range`` attribute or the fast-math flags above,
+the code shown is insufficient to handle the result. Instead, code must be
+added to check for infinities if either the real or imaginary component of the
+result is a NaN value.
+
+
+'``llvm.experimental.complex.fdiv.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> <op1>, <2 x float> <op2>)
+      declare <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> <op1>, <2 x double> <op2>)
+      declare <4 x float> @llvm.experimental.complex.fdiv.v4f32(<4 x float> <op1>, <4 x float> <op2>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.complex.fdiv``' intrinsic returns the quotient of its
+two operands.
+
+Arguments:
+""""""""""
+
+The arguments to the '``llvm.experimental.complex.fdiv``' intrinsic must be a
+:ref:`vector <t_vector>` of :ref:`floating-point <t_floating>` type whose
+length is divisible by 2.
+
+Semantics:
+""""""""""
+
+The value produced is the complex quotient of the two inputs.
+
+If the ``complex-limited-range`` attribute is provided, the output will be
+equivalent to the following code:
+
+.. code-block:: llvm
+
+      define <2 x float> @limited_complex_div(<2 x float> %op1, <2 x float> %op2) {
+        %x = extractelement <2 x float> %op1, i32 0 ; real of %op1
+        %y = extractelement <2 x float> %op1, i32 1 ; imag of %op1
+        %u = extractelement <2 x float> %op2, i32 0 ; real of %op2
+        %v = extractelement <2 x float> %op2, i32 1 ; imag of %op2
+        %xu = fmul float %x, %u
+        %yv = fmul float %y, %v
+        %yu = fmul float %y, %u
+        %xv = fmul float %x, %v
+        %uu = fmul float %u, %u
+        %vv = fmul float %v, %v
+        %unscaled_real = fadd float %xu, %yv
+        %unscaled_imag = fsub float %yu, %xv
+        %scale = fadd float %uu, %vv
+        %out_real = fdiv float %unscaled_real, %scale
+        %out_imag = fdiv float %unscaled_imag, %scale
+        %ret.0 = insertelement <2 x float> undef, float %out_real, i32 0
+        %ret.1 = insertelement <2 x float> %ret.0, float %out_imag, i32 1
+        ret <2 x float> %ret.1
+      }
+
+Without the ``complex-limited-range`` attribute, the above code would be an
+insufficient implementation. Instead, code is needed to scale the input values
+to prevent potential overflow; this is true even if the ``nnan`` and ``ninf``
+flags are specified. The ``arcp`` fast-math flag may also be useful, as it
+permits the divisions to be replaced with multiplications by a reciprocal
+instead.
+
+The ``complex-no-scale`` attribute (implied by ``complex-limited-range``) can
+be used to eliminate the scaling requirement.
+
 Matrix Intrinsics
 -----------------
diff --git a/llvm/include/llvm/CodeGen/ExpandComplex.h b/llvm/include/llvm/CodeGen/ExpandComplex.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ExpandComplex.h
@@ -0,0 +1,25 @@
+//===---- ExpandComplex.h - Expand experimental complex intrinsics -------===//
+//
+// Copyright (C) 2021 Intel Corporation. All rights reserved.
+//
+// The information and source code contained herein is the exclusive
+// property of Intel Corporation and may not be disclosed, examined
+// or reproduced in whole or in part without explicit written authorization
+// from the company.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EXPANDCOMPLEX_H
+#define LLVM_CODEGEN_EXPANDCOMPLEX_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ExpandComplexPass : public PassInfoMixin<ExpandComplexPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_EXPANDCOMPLEX_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -473,6 +473,10 @@
   /// printing assembly.
   ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true);
 
+  /// This pass expands the experimental complex intrinsics into regular
+  /// floating-point arithmetic or calls to __mulsc3 (or similar) functions.
+  FunctionPass *createExpandComplexPass();
+
   /// This pass expands the experimental reduction intrinsics into sequences of
   /// shuffles.
   FunctionPass *createExpandReductionsPass();
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -647,6 +647,23 @@
     return false;
   }
 
+  /// Enum that specifies how a C complex type is lowered (in LLVM type terms).
+  enum class ComplexABI {
+    Memory, ///< Indicates that a pointer to the struct is passed.
+    Vector, ///< Indicates that T _Complex can be passed as <2 x T>.
+    Struct, ///< Indicates that T _Complex can be passed as {T, T}.
+  };
+
+  /// Returns how a C complex type is lowered when used as the return value.
+  virtual ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const {
+    return ComplexABI::Struct;
+  }
+
+  /// Returns true if the target can match the @llvm.experimental.complex.fmul
+  /// intrinsic with the given type. Such an intrinsic is assumed to be
+  /// matched only when "complex-limited-range" is in effect.
+  virtual bool hasComplexMultiply(Type *FloatTy) const { return false; }
+
   /// Return if the target supports combining a
   /// chain like:
   /// \code
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -301,6 +301,18 @@
 def ProfileSampleAccurate : StrBoolAttr<"profile-sample-accurate">;
 def UseSampleProfile : StrBoolAttr<"use-sample-profile">;
 
+/// This attribute indicates that complex multiply, division, and absolute
+/// value expressions can be simplified to their trivial mathematical
+/// expressions. It has an equivalent effect to specifying the STDC
+/// CX_LIMITED_RANGE pragma in C complex arithmetic code.
+def ComplexLimitedRange : StrBoolAttr<"complex-limited-range">;
+
+/// This attribute indicates that complex division expressions do not need any
+/// pre-scaling of their operands before doing arithmetic and can instead be
+/// computed with the trivial mathematical expression. Unlike
+/// complex-limited-range, however, the NaN processing is still required.
+def ComplexNoScale : StrBoolAttr<"complex-no-scale">;
+
 class CompatRule {
   // The name of the function called to check the attribute of the caller and
   // callee and decide whether inlining should be allowed. The function's
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -1636,6 +1636,34 @@
   Value *CreateNAryOp(unsigned Opc, ArrayRef<Value *> Ops,
                      const Twine &Name = "", MDNode *FPMathTag = nullptr);
 
+  /// Construct a complex value out of a pair of real and imaginary values.
+  /// The resulting value will be a vector, with lane 0 being the real value
+  /// and lane 1 being the imaginary value.
+  /// Either the \p Real or \p Imag parameter may be null, if the input is a
+  /// pure real or pure imaginary number.
+  Value *CreateComplexValue(Value *Real, Value *Imag, const Twine &Name = "") {
+    Type *ScalarTy = (Real ? Real : Imag)->getType();
+    assert(ScalarTy->isFloatingPointTy() &&
+           "Only floating-point types may be complex values.");
+    Type *ComplexTy = FixedVectorType::get(ScalarTy, 2);
+    Value *Base = PoisonValue::get(ComplexTy);
+    if (Real)
+      Base = CreateInsertElement(Base, Real, uint64_t(0), Name);
+    if (Imag)
+      Base = CreateInsertElement(Base, Imag, uint64_t(1), Name);
+    return Base;
+  }
+
+  /// Construct a complex multiply operation, setting fast-math flags and the
+  /// complex-limited-range attribute as appropriate.
+  Value *CreateComplexMul(Value *L, Value *R, bool CxLimitedRange,
+                          const Twine &Name = "");
+
+  /// Construct a complex divide operation, setting fast-math flags and the
+  /// complex-limited-range and complex-no-scale attributes as appropriate.
+  Value *CreateComplexDiv(Value *L, Value *R, bool CxLimitedRange,
+                          bool CxNoScale = false, const Twine &Name = "");
+
   //===--------------------------------------------------------------------===//
   // Instruction creation methods: Memory Instructions
   //===--------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1730,6 +1730,16 @@
                                  [llvm_anyvector_ty]>;
 }
 
+//===----- Complex math intrinsics ----------------------------------------===//
+
+def int_experimental_complex_fmul : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+    [LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrNoMem]>;
+
+def int_experimental_complex_fdiv : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+    [LLVMMatchType<0>, LLVMMatchType<0>],
+    [IntrNoMem]>;
+
 //===----- Matrix intrinsics ---------------------------------------------===//
 
 def int_matrix_transpose
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -153,6 +153,7 @@
 void initializeEHContGuardCatchretPass(PassRegistry &);
 void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
 void initializeEntryExitInstrumenterPass(PassRegistry&);
+void initializeExpandComplexPass(PassRegistry &);
 void initializeExpandMemCmpPassPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -26,6 +26,7 @@
   EdgeBundles.cpp
   EHContGuardCatchret.cpp
   ExecutionDomainFix.cpp
+  ExpandComplex.cpp
   ExpandMemCmp.cpp
   ExpandPostRAPseudos.cpp
   ExpandReductions.cpp
diff --git a/llvm/lib/CodeGen/ExpandComplex.cpp b/llvm/lib/CodeGen/ExpandComplex.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/CodeGen/ExpandComplex.cpp
@@ -0,0 +1,282 @@
+//===-- ExpandComplex.cpp - Expand experimental complex intrinsics -------===//
+//
+// Copyright (C) 2021 Intel Corporation. All rights reserved.
+//
+// The information and source code contained herein is the exclusive
+// property of Intel Corporation and may not be disclosed, examined
+// or reproduced in whole or in part without explicit written authorization
+// from the company.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for the experimental complex intrinsics,
+// allowing targets to keep the intrinsics legal until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExpandComplex.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+namespace {
+
+bool expandComplexInstruction(IntrinsicInst *CI, const TargetLowering *TLI,
+                              const DataLayout &DL) {
+  Intrinsic::ID Opcode = CI->getIntrinsicID();
+  assert((Opcode == Intrinsic::experimental_complex_fmul ||
+          Opcode == Intrinsic::experimental_complex_fdiv) &&
+         "Expected a complex instruction");
+
+  // Break the input values up into real and imaginary pieces.
+  Type *ComplexVectorTy = CI->getArgOperand(0)->getType();
+  Type *FloatTy = ComplexVectorTy->getScalarType();
+  IRBuilder<> Builder(CI);
+  Builder.setFastMathFlags(CI->getFastMathFlags());
+  Value *LhsR = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(0));
+  Value *LhsI = Builder.CreateExtractElement(CI->getArgOperand(0), uint64_t(1));
+  Value *RhsR = nullptr, *RhsI = nullptr;
+  RhsR = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(0));
+  RhsI = Builder.CreateExtractElement(CI->getArgOperand(1), uint64_t(1));
+
+  // The expansion has three pieces: the naive arithmetic, a possible
+  // prescaling (not relevant for multiplication), and a step to convert NaN
+  // output values to infinity values in certain situations (see Annex G of
+  // the C specification for more details).
+  //
+  // For now, we use the compiler-rt function directly if we need either of
+  // the latter two pieces; otherwise, we do the expansion manually here.
+  Value *OutReal, *OutImag;
+  bool CanExpand = false;
+  // Complex-limited-range explicitly requests only the naive arithmetic step.
+  if (CI->hasFnAttr("complex-limited-range"))
+    CanExpand = true;
+  else {
+    // The NaN check is essentially structured as
+    //   if (isnan(result_real) && isnan(result_imag)) {
+    //     if (isinf(a) || isinf(b)) { /* several statements like this */ }
+    //   }
+    // Therefore, setting one of nnan or ninf alone is sufficient to disable
+    // the recalculation check: nnan by disabling the outer if statement, and
+    // ninf by disabling the inner if statements (making the outer one empty).
+    bool SkipNaNCheck =
+        CI->getFastMathFlags().noNaNs() || CI->getFastMathFlags().noInfs();
+    bool HasScale = Opcode != Intrinsic::experimental_complex_fmul &&
+                    !CI->hasFnAttr("complex-no-scale");
+    CanExpand = SkipNaNCheck && !HasScale;
+  }
+  if (!CanExpand) {
+    // Do a call directly to the compiler-rt library here.
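+    // For reference (a summary of compiler-rt behavior, not code in this
+    // patch): __mulsc3(a, b, c, d) computes (a + b*i) * (c + d*i) for float,
+    // and the h/d/x/t suffixes denote the half/double/x86-fp80/fp128
+    // variants; the __div*c3 family computes (a + b*i) / (c + d*i) with the
+    // full Annex G overflow and NaN handling.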
+    const char *Name = nullptr;
+    if (Opcode == Intrinsic::experimental_complex_fmul) {
+      if (FloatTy->isHalfTy())
+        Name = "__mulhc3";
+      else if (FloatTy->isFloatTy())
+        Name = "__mulsc3";
+      else if (FloatTy->isDoubleTy())
+        Name = "__muldc3";
+      else if (FloatTy->isX86_FP80Ty())
+        Name = "__mulxc3";
+      else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty())
+        Name = "__multc3";
+    } else if (Opcode == Intrinsic::experimental_complex_fdiv) {
+      if (FloatTy->isHalfTy())
+        Name = "__divhc3";
+      else if (FloatTy->isFloatTy())
+        Name = "__divsc3";
+      else if (FloatTy->isDoubleTy())
+        Name = "__divdc3";
+      else if (FloatTy->isX86_FP80Ty())
+        Name = "__divxc3";
+      else if (FloatTy->isFP128Ty() || FloatTy->isPPC_FP128Ty())
+        Name = "__divtc3";
+    }
+
+    if (!Name)
+      report_fatal_error("Cannot find libcall for intrinsic");
+
+    // The function we are to call is T complex __name(T, T, T, T) in C terms.
+    // Use TLI to figure out the appropriate actual ABI for this function.
+    StructType *ComplexStructTy = StructType::get(FloatTy, FloatTy);
+    switch (TLI->getComplexReturnABI(FloatTy)) {
+    case TargetLowering::ComplexABI::Vector: {
+      // When the result is a vector type directly, we can replace the
+      // intrinsic with the call to the underlying function without any other
+      // munging.
+      FunctionCallee Func = CI->getModule()->getOrInsertFunction(
+          Name, ComplexVectorTy, FloatTy, FloatTy, FloatTy, FloatTy);
+      Value *NewResult = Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI});
+      CI->replaceAllUsesWith(NewResult);
+      CI->eraseFromParent();
+      return true;
+    }
+    case TargetLowering::ComplexABI::Memory: {
+      // Allocate a struct for the return type in the entry block. Stack slot
+      // coloring should remove duplicate allocations.
+      unsigned AllocaAS = DL.getAllocaAddrSpace();
+      Value *Alloca;
+      {
+        IRBuilderBase::InsertPointGuard Guard(Builder);
+        BasicBlock *EntryBB = &CI->getParent()->getParent()->getEntryBlock();
+        Builder.SetInsertPoint(EntryBB, EntryBB->begin());
+        Alloca = Builder.CreateAlloca(ComplexStructTy, AllocaAS);
+      }
+
+      AttributeList Attrs;
+      AttrBuilder AB(Attrs, 0);
+      AB.addStructRetAttr(ComplexStructTy);
+      Attrs = Attrs.addParamAttributes(CI->getContext(), 0, AB);
+      FunctionCallee Func = CI->getModule()->getOrInsertFunction(
+          Name, Attrs, Type::getVoidTy(CI->getContext()),
+          PointerType::get(ComplexStructTy, AllocaAS), FloatTy, FloatTy,
+          FloatTy, FloatTy);
+
+      Builder.CreateCall(Func, {Alloca, LhsR, LhsI, RhsR, RhsI});
+      OutReal = Builder.CreateLoad(
+          FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 0));
+      OutImag = Builder.CreateLoad(
+          FloatTy, Builder.CreateStructGEP(ComplexStructTy, Alloca, 1));
+      break;
+    }
+    case TargetLowering::ComplexABI::Struct: {
+      FunctionCallee Func = CI->getModule()->getOrInsertFunction(
+          Name, ComplexStructTy, FloatTy, FloatTy, FloatTy, FloatTy);
+      Value *ComplexStructRes =
+          Builder.CreateCall(Func, {LhsR, LhsI, RhsR, RhsI});
+      OutReal = Builder.CreateExtractValue(ComplexStructRes, 0);
+      OutImag = Builder.CreateExtractValue(ComplexStructRes, 1);
+      break;
+    }
+    }
+  } else {
+    switch (Opcode) {
+    case Intrinsic::experimental_complex_fmul: {
+      // If the target has a complex_fmul expansion, use that instead of
+      // expanding.
+      if (TLI->hasComplexMultiply(FloatTy))
+        return false;
+
+      OutReal = Builder.CreateFSub(Builder.CreateFMul(LhsR, RhsR),
+                                   Builder.CreateFMul(LhsI, RhsI));
+      OutImag = Builder.CreateFAdd(Builder.CreateFMul(LhsI, RhsR),
+                                   Builder.CreateFMul(LhsR, RhsI));
+      break;
+    }
+    case Intrinsic::experimental_complex_fdiv: {
+      Value *Scale = Builder.CreateFAdd(Builder.CreateFMul(RhsR, RhsR),
+                                        Builder.CreateFMul(RhsI, RhsI));
+      OutReal =
+          Builder.CreateFDiv(Builder.CreateFAdd(Builder.CreateFMul(LhsR, RhsR),
+                                                Builder.CreateFMul(LhsI, RhsI)),
+                             Scale);
+      OutImag =
+          Builder.CreateFDiv(Builder.CreateFSub(Builder.CreateFMul(LhsI, RhsR),
+                                                Builder.CreateFMul(LhsR, RhsI)),
+                             Scale);
+      break;
+    }
+    }
+  }
+
+  // Replace all of the uses of the intrinsic with OutReal/OutImag. We avoid
+  // creating the vector unless we have to.
+  bool HasVectorUse = false;
+  for (User *U : CI->users()) {
+    uint64_t Index;
+    if (match(U, m_ExtractElt(m_Value(), m_ConstantInt(Index)))) {
+      assert((Index == 0 || Index == 1) && "Unexpected extract element index");
+      U->replaceAllUsesWith(Index == 0 ? OutReal : OutImag);
+    } else {
+      HasVectorUse = true;
+    }
+  }
+
+  if (HasVectorUse) {
+    Value *OutComplex = Builder.CreateInsertElement(
+        Builder.CreateInsertElement(UndefValue::get(ComplexVectorTy), OutReal,
+                                    uint64_t(0)),
+        OutImag, uint64_t(1));
+    CI->replaceAllUsesWith(OutComplex);
+  } else {
+    CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+  }
+
+  CI->eraseFromParent();
+  return true;
+}
+
+bool expandComplexIntrinsics(Function &F, const TargetLowering *TLI) {
+  bool Changed = false;
+  SmallVector<IntrinsicInst *, 4> Worklist;
+  for (auto &I : instructions(F)) {
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      switch (II->getIntrinsicID()) {
+      default:
+        break;
+      case Intrinsic::experimental_complex_fmul:
+      case Intrinsic::experimental_complex_fdiv:
+        Worklist.push_back(II);
+        break;
+      }
+    }
+  }
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  for (auto *II : Worklist) {
+    Changed |= expandComplexInstruction(II, TLI, DL);
+  }
+  return Changed;
+}
+
+class ExpandComplex : public FunctionPass {
+public:
+  static char ID;
+  ExpandComplex() : FunctionPass(ID) {
+    initializeExpandComplexPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    const TargetMachine *TM =
+        &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+    const TargetSubtargetInfo *SubtargetInfo = TM->getSubtargetImpl(F);
+    const TargetLowering *TLI = SubtargetInfo->getTargetLowering();
+    return expandComplexIntrinsics(F, TLI);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.setPreservesCFG();
+  }
+};
+} // namespace
+
+char ExpandComplex::ID;
+INITIALIZE_PASS_BEGIN(ExpandComplex, "expand-complex",
+                      "Expand complex intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(ExpandComplex, "expand-complex",
+                    "Expand complex intrinsics", false, false)
+
+FunctionPass *llvm::createExpandComplexPass() { return new ExpandComplex(); }
+
+PreservedAnalyses ExpandComplexPass::run(Function &F,
+                                         FunctionAnalysisManager &AM) {
+  /*const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!expandReductions(F, &TTI))
+    return PreservedAnalyses::all();*/
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -922,6 +922,10 @@
   // Allow disabling it for testing purposes.
   if (!DisableExpandReductions)
     addPass(createExpandReductionsPass());
+
+  // If the target doesn't support the complex intrinsics natively, or if they
+  // need to be expanded into libcalls, generate that expansion here.
+  addPass(createExpandComplexPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -982,6 +982,33 @@
   return C;
 }
 
+Value *IRBuilderBase::CreateComplexMul(Value *L, Value *R, bool CxLimitedRange,
+                                       const Twine &Name) {
+  CallInst *Result = CreateBinaryIntrinsic(
+      Intrinsic::experimental_complex_fmul, L, R, nullptr, Name);
+  Result->setFastMathFlags(FMF);
+  AttributeList Attrs = Result->getAttributes();
+  if (CxLimitedRange)
+    Attrs = Attrs.addFnAttribute(getContext(), "complex-limited-range");
+  Result->setAttributes(Attrs);
+  return Result;
+}
+
+Value *IRBuilderBase::CreateComplexDiv(Value *L, Value *R, bool CxLimitedRange,
+                                       bool CxNoScale, const Twine &Name) {
+  CallInst *Result = CreateBinaryIntrinsic(
+      Intrinsic::experimental_complex_fdiv, L, R, nullptr, Name);
+  Result->setFastMathFlags(FMF);
+  AttributeList Attrs = Result->getAttributes();
+  if (CxLimitedRange)
+    Attrs = Attrs.addFnAttribute(getContext(), "complex-limited-range");
+  // complex-limited-range implies complex-no-scale.
+  if (CxNoScale || CxLimitedRange)
+    Attrs = Attrs.addFnAttribute(getContext(), "complex-no-scale");
+  Result->setAttributes(Attrs);
+  return Result;
+}
+
 Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
                                    const Twine &Name, Instruction *MDFrom) {
   if (auto *CC = dyn_cast<Constant>(C))
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5395,6 +5395,18 @@
            &Call);
     break;
   }
+  case Intrinsic::experimental_complex_fdiv:
+  case Intrinsic::experimental_complex_fmul: {
+    // Check that the vector type is a pair of floating-point types.
+    Type *ArgTy = Call.getArgOperand(0)->getType();
+    FixedVectorType *VectorTy = dyn_cast<FixedVectorType>(ArgTy);
+    Assert(VectorTy && VectorTy->getNumElements() % 2 == 0 &&
+               VectorTy->getElementType()->isFloatingPointTy(),
+           "complex intrinsic must use an even-length vector of "
+           "floating-point types",
+           &Call);
+    break;
+  }
  };
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -979,6 +979,8 @@
     /// legal as the hook is used before type legalization.
     bool isSafeMemOpType(MVT VT) const override;
 
+    ComplexABI getComplexReturnABI(Type *ScalarFloatTy) const override;
+
     /// Returns true if the target allows unaligned memory accesses of the
     /// specified type. Returns whether it is "fast" in the last argument.
     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2563,6 +2563,32 @@
   return MVT::i32;
 }
 
+TargetLoweringBase::ComplexABI
+X86TargetLowering::getComplexReturnABI(Type *ScalarFloatTy) const {
+  if (Subtarget.is32Bit()) {
+    if (ScalarFloatTy->isFloatTy()) {
+      report_fatal_error("Cannot compile complex return ABI for i386 ABI");
+    } else if (ScalarFloatTy->isHalfTy()) {
+      return ComplexABI::Vector;
+    } else {
+      return ComplexABI::Memory;
+    }
+  } else {
+    // The x86-64 ABI specifies that (save for x86-fp80) a complex value is
+    // handled as a regular C struct. This means that float and smaller get
+    // packed into a single vector in xmm0; double and x86-fp80 (by special
+    // case) return two values; and types larger than x86-fp80 (i.e., fp128)
+    // return via memory.
+    unsigned FloatSize = ScalarFloatTy->getPrimitiveSizeInBits().getFixedSize();
+    if (FloatSize <= 32) {
+      return ComplexABI::Vector;
+    } else if (FloatSize <= 80) {
+      return ComplexABI::Struct;
+    } else {
+      return ComplexABI::Memory;
+    }
+  }
+}
+
 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
   if (VT == MVT::f32)
     return X86ScalarSSEf32;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -737,6 +737,10 @@
   ///
   /// If the multiplication is known not to overflow then NoSignedWrap is set.
   Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+
+  /// Try to match a complex intrinsic that produces the given real/imaginary
+  /// pair. Returns whether or not it was successful.
+  bool createComplexMathInstruction(Value *Real, Value *Imag);
 };
 
 class Negator final {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1413,6 +1413,33 @@
       eraseInstFromFunction(*PrevSI);
       return nullptr;
     }
+
+    // Is this potentially a complex instruction?
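+    // The shape being matched below: two unordered stores into the same
+    // aggregate through GEPs that agree on every index except the last,
+    // which is 0 for the earlier store (the real part) and 1 for this one
+    // (the imaginary part). Together they store one complex value.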
+    auto *OurGEP = dyn_cast<GetElementPtrInst>(Ptr);
+    auto *TheirGEP = dyn_cast<GetElementPtrInst>(PrevSI->getOperand(1));
+    if (PrevSI->isUnordered() && OurGEP && TheirGEP &&
+        OurGEP->getOperand(0) == TheirGEP->getOperand(0) &&
+        OurGEP->getNumIndices() == TheirGEP->getNumIndices() &&
+        OurGEP->getType() == TheirGEP->getType()) {
+      bool AllMatch = true;
+      unsigned LastIndex = OurGEP->getNumIndices();
+      for (unsigned Index = 1; Index < LastIndex; Index++) {
+        if (OurGEP->getOperand(Index) != TheirGEP->getOperand(Index)) {
+          AllMatch = false;
+          break;
+        }
+      }
+      if (!AllMatch)
+        break;
+      if (match(OurGEP->getOperand(LastIndex), m_ConstantInt<1>()) &&
+          match(TheirGEP->getOperand(LastIndex), m_ConstantInt<0>())) {
+        IRBuilderBase::InsertPointGuard Guard(Builder);
+        Builder.SetInsertPoint(PrevSI);
+        if (createComplexMathInstruction(PrevSI->getOperand(0), Val))
+          return &SI;
+      }
+    }
+
     break;
   }
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1109,6 +1109,21 @@
   if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))
     return NewI;
 
+  // Check if this is potentially a complex instruction that has been manually
+  // expanded.
+  ArrayRef<Type *> Fields = I.getType()->subtypes();
+  if (Fields.size() == 2 && Fields[0] == Fields[1] &&
+      Fields[0]->isFloatingPointTy()) {
+    Value *RealV, *ImgV;
+    if (match(&I, m_InsertValue<1>(m_InsertValue<0>(m_Value(), m_Value(RealV)),
+                                   m_Value(ImgV)))) {
+      IRBuilderBase::InsertPointGuard Guard(Builder);
+      Builder.SetInsertPoint(cast<Instruction>(I.getOperand(0)));
+      if (createComplexMathInstruction(RealV, ImgV))
+        return &I;
+    }
+  }
+
   return nullptr;
 }
@@ -1589,6 +1604,17 @@
   if (Instruction *Ext = narrowInsElt(IE, Builder))
     return Ext;
 
+  // Check for a potential computation of a complex instruction.
+  ElementCount Count = IE.getType()->getElementCount();
+  Value *RealV, *ImagV;
+  if (!Count.isScalable() && Count.getFixedValue() == 2 &&
+      match(&IE, m_InsertElt(
+                     m_InsertElt(m_Value(), m_Value(RealV), m_ConstantInt<0>()),
+                     m_Value(ImagV), m_ConstantInt<1>()))) {
+    if (createComplexMathInstruction(RealV, ImagV))
+      return &IE;
+  }
+
   return nullptr;
 }
@@ -2793,3 +2819,120 @@
   return MadeChange ? &SVI : nullptr;
 }
+
+static cl::opt<bool> InstCombineComplex(
+    "inst-combine-complex",
+    cl::desc("Enable pattern match to llvm.experimental.complex.* intrinsics"));
+
+bool InstCombinerImpl::createComplexMathInstruction(Value *Real, Value *Imag) {
+  if (!InstCombineComplex)
+    return false;
+
+  Instruction *RealI = dyn_cast<Instruction>(Real);
+  Instruction *ImagI = dyn_cast<Instruction>(Imag);
+  if (!RealI || !ImagI)
+    return false;
+
+  // Don't try to handle vector instructions for now.
+  if (RealI->getType()->isVectorTy())
+    return false;
+
+  Value *Op0R, *Op0I, *Op1R, *Op1I, *Scale, *Numerator;
+  // Compute the intersection of all the fast math flags of the entire tree up
+  // to the point that the input complex numbers are specified.
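+  // A fast-math flag may be kept on the fused intrinsic only if every
+  // instruction being replaced carries it, so start from the full flag set
+  // and intersect (&=) while walking the tree.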
+  auto computeFMF = [&]() {
+    SmallVector<Instruction *, 8> Worklist = {RealI, ImagI};
+    FastMathFlags Flags;
+    Flags.set();
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      Flags &= I->getFastMathFlags();
+      for (Use &U : I->operands()) {
+        Value *V = U.get();
+        if (V == Op0R || V == Op0I || V == Op1R || V == Op1I)
+          continue;
+        Worklist.push_back(cast<Instruction>(V));
+      }
+    }
+    return Flags;
+  };
+
+  Intrinsic::ID NewIntrinsic = Intrinsic::not_intrinsic;
+  // Check for complex multiply:
+  //   real = op0.real * op1.real - op0.imag * op1.imag
+  //   imag = op0.real * op1.imag + op0.imag * op1.real
+  if (match(Real, m_FSub(m_OneUse(m_FMul(m_Value(Op0R), m_Value(Op1R))),
+                         m_OneUse(m_FMul(m_Value(Op0I), m_Value(Op1I)))))) {
+    if (match(
+            Imag,
+            m_c_FAdd(m_OneUse(m_c_FMul(m_Specific(Op0R), m_Specific(Op1I))),
+                     m_OneUse(m_c_FMul(m_Specific(Op1R), m_Specific(Op0I)))))) {
+      NewIntrinsic = Intrinsic::experimental_complex_fmul;
+    }
+  }
+  // Check for complex div:
+  //   real = (op0.real * op1.real + op0.imag * op1.imag) / scale
+  //   imag = (op0.imag * op1.real - op0.real * op1.imag) / scale
+  // where scale = op1.real * op1.real + op1.imag * op1.imag
+  else if (match(Imag, m_FDiv(m_Value(Numerator), m_Value(Scale)))) {
+    if (match(Scale,
+              m_FAdd(m_OneUse(m_FMul(m_Value(Op1R), m_Deferred(Op1R))),
+                     m_OneUse(m_FMul(m_Value(Op1I), m_Deferred(Op1I)))))) {
+      // The matching of Op1R and Op1I is tentative; we may need to swap the
+      // assignments.
+      auto checkNumerator = [&]() {
+        return match(Numerator,
+                     m_OneUse(m_FSub(
+                         m_OneUse(m_c_FMul(m_Value(Op0I), m_Specific(Op1R))),
+                         m_OneUse(m_c_FMul(m_Value(Op0R), m_Specific(Op1I))))));
+      };
+      bool ImagMatches = checkNumerator();
+      if (!ImagMatches) {
+        std::swap(Op1R, Op1I);
+        ImagMatches = checkNumerator();
+      }
+      if (ImagMatches &&
+          match(Real,
+                m_FDiv(m_OneUse(m_c_FAdd(m_OneUse(m_c_FMul(m_Specific(Op0R),
+                                                           m_Specific(Op1R))),
+                                         m_OneUse(m_c_FMul(m_Specific(Op0I),
+                                                           m_Specific(Op1I))))),
+                       m_Specific(Scale)))) {
+        NewIntrinsic = Intrinsic::experimental_complex_fdiv;
+      }
+    }
+  }
+
+  // Make sure we matched an intrinsic.
+  if (NewIntrinsic == Intrinsic::not_intrinsic)
+    return false;
+
+  // Use the computation tree to capture all of the fast-math flags.
+  IRBuilderBase::FastMathFlagGuard FMFGuard(Builder);
+  Builder.setFastMathFlags(computeFMF());
+
+  Value *Op0 = Builder.CreateComplexValue(Op0R, Op0I);
+  Value *Op1 = Builder.CreateComplexValue(Op1R, Op1I);
+
+  // Create the new intrinsic. Since our pattern matched only the direct
+  // arithmetic formulas, we have to create it with the complex-limited-range
+  // attribute.
+  Value *Result;
+  switch (NewIntrinsic) {
+  case Intrinsic::experimental_complex_fmul:
+    Result = Builder.CreateComplexMul(Op0, Op1, true);
+    break;
+  case Intrinsic::experimental_complex_fdiv:
+    Result = Builder.CreateComplexDiv(Op0, Op1, true);
+    break;
+  default:
+    llvm_unreachable("Unexpected complex intrinsic");
+  }
+
+  replaceInstUsesWith(*RealI,
+                      Builder.CreateExtractElement(Result, uint64_t(0)));
+  replaceInstUsesWith(*ImagI,
+                      Builder.CreateExtractElement(Result, uint64_t(1)));
+
+  return true;
+}
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -29,6 +29,7 @@
 ; CHECK-NEXT:       Expand vector predication intrinsics
 ; CHECK-NEXT:       Scalarize Masked Memory Intrinsics
 ; CHECK-NEXT:       Expand reduction intrinsics
+; CHECK-NEXT:       Expand complex intrinsics
 ; CHECK-NEXT:       Expand indirectbr instructions
 ; CHECK-NEXT:       Exception handling preparation
 ; CHECK-NEXT:       Safe Stack instrumentation pass
diff --git a/llvm/test/CodeGen/X86/complex-32bit.ll b/llvm/test/CodeGen/X86/complex-32bit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-32bit.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-linux-gnu | FileCheck %s
+
+; Check that we handle the ABI of the complex functions correctly for 32-bit.
+
+declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>)
+declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>)
+declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>)
+
+define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) {
+; CHECK-LABEL: intrinsic_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %ecx
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %edx
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    calll __mulhc3@PLT
+; CHECK-NEXT:    addl $24, %esp
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -24
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+  %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w)
+  ret <2 x half> %mul
+}
+
+; Skip intrinsic_f32 -- we don't support complex float on 32-bit for now.
+
+define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) {
+; CHECK-LABEL: intrinsic_f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll __muldc3@PLT
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    addl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+  %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w)
+  ret <2 x double> %mul
+}
+
+define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) {
+; CHECK-LABEL: intrinsic_f80:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $92, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, (%esp)
+; CHECK-NEXT:    calll __mulxc3@PLT
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fxch %st(1)
+; CHECK-NEXT:    addl $92, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+  %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w)
+  ret <2 x x86_fp80> %mul
+}
+
+define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) {
+; CHECK-LABEL: intrinsic_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    subl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset %esi, -20
+; CHECK-NEXT:    .cfi_offset %edi, -16
+; CHECK-NEXT:    .cfi_offset %ebx, -12
+; CHECK-NEXT:    .cfi_offset %ebp, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 12
+; CHECK-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:    calll __multc3@PLT
+; CHECK-NEXT:    addl $80, %esp
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -80
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, 28(%esi)
+; CHECK-NEXT:    movl %eax, 24(%esi)
+; CHECK-NEXT:    movl %ebp, 20(%esi)
+; CHECK-NEXT:    movl %ebx, 16(%esi)
+; CHECK-NEXT:    movl %edi, 12(%esi)
+; CHECK-NEXT:    movl %edx, 8(%esi)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, 4(%esi)
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-NEXT:    movl %eax, (%esi)
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    addl $60, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 20
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl $4
+  %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w)
+  ret <2 x fp128> %mul
+}
+
diff --git a/llvm/test/CodeGen/X86/complex-64bit.ll b/llvm/test/CodeGen/X86/complex-64bit.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-64bit.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Check that we handle the ABI of the complex functions correctly for 64-bit.
+
+declare <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half>, <2 x half>)
+declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>)
+declare <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80>, <2 x x86_fp80>)
+declare <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128>, <2 x fp128>)
+
+define <2 x half> @intrinsic_f16(<2 x half> %z, <2 x half> %w) {
+; CHECK-LABEL: intrinsic_f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    callq __mulhc3@PLT
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x half> @llvm.experimental.complex.fmul.v2f16(<2 x half> %z, <2 x half> %w)
+  ret <2 x half> %mul
+}
+
+define <2 x float> @intrinsic_f32(<2 x float> %z, <2 x float> %w) {
+; CHECK-LABEL: intrinsic_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; CHECK-NEXT:    callq __mulsc3@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+define <2 x double> @intrinsic_f64(<2 x double> %z, <2 x double> %w) {
+; CHECK-LABEL: intrinsic_f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; CHECK-NEXT:    callq __muldc3@PLT
+; CHECK-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %z, <2 x double> %w)
+  ret <2 x double> %mul
+}
+
+define <2 x x86_fp80> @intrinsic_f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w) {
+; CHECK-LABEL: intrinsic_f80:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fstpt (%rsp)
+; CHECK-NEXT:    callq __mulxc3@PLT
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x x86_fp80> @llvm.experimental.complex.fmul.v2f80(<2 x x86_fp80> %z, <2 x x86_fp80> %w)
+  ret <2 x x86_fp80> %mul
+}
+
+define <2 x fp128> @intrinsic_f128(<2 x fp128> %z, <2 x fp128> %w) {
+; CHECK-LABEL: intrinsic_f128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movq %rsp, %rdi
+; CHECK-NEXT:    callq __multc3@PLT
+; CHECK-NEXT:    movaps (%rsp), %xmm0
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %mul = call <2 x fp128> @llvm.experimental.complex.fmul.v2f128(<2 x fp128> %z, <2 x fp128> %w)
+  ret <2 x fp128> %mul
+}
+
diff --git a/llvm/test/CodeGen/X86/complex-divide.ll b/llvm/test/CodeGen/X86/complex-divide.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-divide.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Check the expansion of the complex divide intrinsic. This only tests
+; expansion for 32-bit floats, as the expansion should produce identical IR
+; expansions save for the ABI of calling __divsc3, which is tested (indirectly)
+; for each type individually in complex-{32,64}bit.ll.
+
+declare <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float>, <2 x float>)
+
+; Generate a call to __divsc3
+define <2 x float> @intrinsic_slow_f32(<2 x float> %z, <2 x float> %w) {
+; CHECK-LABEL: intrinsic_slow_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT:    movaps %xmm2, %xmm3
+; CHECK-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; CHECK-NEXT:    callq __divsc3@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %div
+}
+
+; Do not do an expansion (because fast is not sufficient to imply full
+; complex-limited-range).
+define <2 x float> @intrinsic_implied_not_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_implied_not_limited_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    vmovaps %xmm1, %xmm2
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-NEXT:    callq __divsc3@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+  %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %div
+}
+
+; Do an expansion (because nnan/ninf + "complex-no-scale"). No arcp or fma
+; should be used.
+define <2 x float> @intrinsic_implied_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_implied_limited_f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT:    vaddss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT:    vaddss {{.*}} %xmm5
+; CHECK-NEXT:    vdivss %xmm4, %xmm5, %xmm5
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT:    vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vdivss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3]
+; CHECK-NEXT:    retq
+  %div = call nnan ninf <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #2
+  ret <2 x float> %div
+}
+
+; Do an expansion (because of complex-limited-range).
+define <2 x float> @intrinsic_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_limited_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm5
+; CHECK-NEXT: vdivss %xmm4, %xmm5, %xmm5
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vdivss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3]
+; CHECK-NEXT: retq
+  %div = call <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0
+  ret <2 x float> %div
+}
+
+; Do an expansion, and use FMA (because of fast-math flags).
+define <2 x float> @intrinsic_fast_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_fast_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-NEXT: vmulss %xmm3, %xmm3, %xmm4
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm4 = (xmm1 * xmm1) + xmm4
+; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm5
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm5 = (xmm0 * xmm1) + xmm5
+; CHECK-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; CHECK-NEXT: vdivss %xmm4, %xmm6, %xmm4
+; CHECK-NEXT: vmulss %xmm4, %xmm5, %xmm5
+; CHECK-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm2 * xmm1) - xmm0
+; CHECK-NEXT: vmulss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[2,3]
+; CHECK-NEXT: retq
+  %div = call fast <2 x float> @llvm.experimental.complex.fdiv.v2f32(<2 x float> %z, <2 x float> %w) #0
+  ret <2 x float> %div
+}
+
+attributes #0 = { "complex-limited-range"="true" }
+attributes #1 = { "target-features"="+fma" }
+attributes #2 = { "complex-no-scale"="true" }
diff --git a/llvm/test/CodeGen/X86/complex-multiply.ll b/llvm/test/CodeGen/X86/complex-multiply.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/complex-multiply.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Check the expansion of the complex multiply intrinsic. This only tests the
+; expansion for 32-bit floats, since every type produces an identical IR
+; expansion save for the ABI of the __mulsc3 call, which is tested for each
+; type individually in intel-complex-{32,64}bit.ll.
+
+declare <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float>, <2 x float>)
+
+; Generate a call to __mulsc3.
+define <2 x float> @intrinsic_slow_f32(<2 x float> %z, <2 x float> %w) {
+; CHECK-LABEL: intrinsic_slow_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: movaps %xmm1, %xmm2
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; CHECK-NEXT: movaps %xmm2, %xmm3
+; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[1,1]
+; CHECK-NEXT: callq __mulsc3@PLT
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+  %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+; Do an expansion (because nnan/ninf imply limited range for multiplication).
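+; For reference, with z = a+bi and w = c+di, the expansion checked below is
+; the textbook product (a sketch; the NaN/infinity recovery performed by
+; __mulsc3 is deliberately absent):
+;   real = a*c - b*d
+;   imag = a*d + b*c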
+define <2 x float> @intrinsic_implied_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_implied_limited_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vsubss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[2,3]
+; CHECK-NEXT: retq
+  %mul = call nnan ninf <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+; Do an expansion (because of complex-limited-range).
+define <2 x float> @intrinsic_limited_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_limited_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vsubss {{.*}} %xmm4
+; CHECK-COUNT-2: vmulss
+; CHECK-NEXT: vaddss {{.*}} %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[2,3]
+; CHECK-NEXT: retq
+  %mul = call <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w) #0
+  ret <2 x float> %mul
+}
+
+; Do an expansion, and use FMA (because of fast-math flags).
+define <2 x float> @intrinsic_fast_f32(<2 x float> %z, <2 x float> %w) #1 {
+; CHECK-LABEL: intrinsic_fast_f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-NEXT: vmulss %xmm3, %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss {{.*#+}} xmm4 = (xmm0 * xmm1) - xmm4
+; CHECK-NEXT: vmulss %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vfmadd231ss {{.*#+}} xmm0 = (xmm2 * xmm1) + xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[2,3]
+; CHECK-NEXT: retq
+  %mul = call fast <2 x float> @llvm.experimental.complex.fmul.v2f32(<2 x float> %z, <2 x float> %w)
+  ret <2 x float> %mul
+}
+
+attributes #0 = { "complex-limited-range"="true" }
+attributes #1 = { "target-features"="+fma" }
diff --git a/llvm/test/Transforms/InstCombine/complex-math.ll b/llvm/test/Transforms/InstCombine/complex-math.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/complex-math.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
+; RUN: opt < %s -instcombine -S -inst-combine-complex | FileCheck %s
+
+; Check that we match the simple expansions of complex multiplication and
+; division, whether the resulting complex value is returned as a struct,
+; returned as a vector, or stored to memory.
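+; As a rough guide (operand order may vary), the scalar multiply pattern
+; matched below is
+;   %x = fsub (fmul %a, %c), (fmul %b, %d)   ; real part
+;   %y = fadd (fmul %a, %d), (fmul %b, %c)   ; imaginary part
+; and the divide pattern additionally divides (a*c + b*d) and (b*c - a*d) by
+; the scale (c*c + d*d). Each match is rewritten into a single complex
+; intrinsic call tagged "complex-limited-range" (plus "complex-no-scale" for
+; divides), since the matched IR performs no range reduction of its own.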
+
+%complex.double = type {double, double}
+
+define %complex.double @struct_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @struct_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %x = fsub double %ac, %bd
+  %y = fadd double %ad, %bc
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+define <2 x double> @vector_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @vector_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: ret <2 x double> [[TMP5]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %x = fsub double %ac, %bd
+  %y = fadd double %ad, %bc
+  %res = insertelement <2 x double> zeroinitializer, double %x, i32 0
+  %res.1 = insertelement <2 x double> %res, double %y, i32 1
+  ret <2 x double> %res.1
+}
+
+define void @memory_mul(double %a, double %b, double %c, double %d, %complex.double* %dest) {
+; CHECK-LABEL: @memory_mul(
+; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8
+; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8
+; CHECK-NEXT: ret void
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %x = fsub double %ac, %bd
+  %y = fadd double %ad, %bc
+  %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0
+  %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1
+  store double %x, double* %dest.real
+  store double %y, double* %dest.imag
+  ret void
+}
+
+define %complex.double @fast_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fast_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul fast double %a, %c
+  %bd = fmul fast double %b, %d
+  %ad = fmul fast double %a, %d
+  %bc = fmul fast double %b, %c
+  %x = fsub fast double %ac, %bd
+  %y = fadd fast double %ad, %bc
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+; Only fast-math flags common to every instruction in the pattern (here only
+; ninf) should be propagated to the intrinsic call.
+define %complex.double @fastish_mul(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fastish_mul(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call ninf <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR1]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul fast double %a, %c
+  %bd = fmul nnan ninf nsz double %b, %d
+  %ad = fmul ninf arcp contract double %a, %d
+  %bc = fmul reassoc nsz ninf double %b, %c
+  %x = fsub ninf arcp afn double %ac, %bd
+  %y = fadd afn nnan ninf double %ad, %bc
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+define %complex.double @struct_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @struct_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %cc = fmul double %c, %c
+  %dd = fmul double %d, %d
+  %scale = fadd double %cc, %dd
+  %x_noscale = fadd double %ac, %bd
+  %y_noscale = fsub double %bc, %ad
+  %x = fdiv double %x_noscale, %scale
+  %y = fdiv double %y_noscale, %scale
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+define <2 x double> @vector_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @vector_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: ret <2 x double> [[TMP5]]
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %cc = fmul double %c, %c
+  %dd = fmul double %d, %d
+  %scale = fadd double %cc, %dd
+  %x_noscale = fadd double %ac, %bd
+  %y_noscale = fsub double %bc, %ad
+  %x = fdiv double %x_noscale, %scale
+  %y = fdiv double %y_noscale, %scale
+  %res = insertelement <2 x double> zeroinitializer, double %x, i32 0
+  %res.1 = insertelement <2 x double> %res, double %y, i32 1
+  ret <2 x double> %res.1
+}
+
+define void @memory_div(double %a, double %b, double %c, double %d, %complex.double* %dest) {
+; CHECK-LABEL: @memory_div(
+; CHECK-NEXT: [[DEST_REAL:%.*]] = getelementptr [[COMPLEX_DOUBLE:%.*]], %complex.double* [[DEST:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[DEST_IMAG:%.*]] = getelementptr [[COMPLEX_DOUBLE]], %complex.double* [[DEST]], i64 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: store double [[TMP6]], double* [[DEST_REAL]], align 8
+; CHECK-NEXT: store double [[TMP7]], double* [[DEST_IMAG]], align 8
+; CHECK-NEXT: ret void
+;
+  %ac = fmul double %a, %c
+  %bd = fmul double %b, %d
+  %ad = fmul double %a, %d
+  %bc = fmul double %b, %c
+  %cc = fmul double %c, %c
+  %dd = fmul double %d, %d
+  %scale = fadd double %cc, %dd
+  %x_noscale = fadd double %ac, %bd
+  %y_noscale = fsub double %bc, %ad
+  %x = fdiv double %x_noscale, %scale
+  %y = fdiv double %y_noscale, %scale
+  %dest.real = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 0
+  %dest.imag = getelementptr %complex.double, %complex.double* %dest, i64 0, i32 1
+  store double %x, double* %dest.real
+  store double %y, double* %dest.imag
+  ret void
+}
+
+define %complex.double @fast_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fast_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul fast double %a, %c
+  %bd = fmul fast double %b, %d
+  %ad = fmul fast double %a, %d
+  %bc = fmul fast double %b, %c
+  %cc = fmul fast double %c, %c
+  %dd = fmul fast double %d, %d
+  %scale = fadd fast double %cc, %dd
+  %x_noscale = fadd fast double %ac, %bd
+  %y_noscale = fsub fast double %bc, %ad
+  %x = fdiv fast double %x_noscale, %scale
+  %y = fdiv fast double %y_noscale, %scale
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+; Only fast-math flags common to every instruction in the pattern (here only
+; arcp) should be propagated to the intrinsic call.
+define %complex.double @fastish_div(double %a, double %b, double %c, double %d) {
+; CHECK-LABEL: @fastish_div(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[D:%.*]], i64 1
+; CHECK-NEXT: [[TMP5:%.*]] = call arcp <2 x double> @llvm.experimental.complex.fdiv.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) #[[ATTR2]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
+; CHECK-NEXT: [[RES:%.*]] = insertvalue [[COMPLEX_DOUBLE:%.*]] zeroinitializer, double [[TMP6]], 0
+; CHECK-NEXT: [[RES_1:%.*]] = insertvalue [[COMPLEX_DOUBLE]] [[RES]], double [[TMP7]], 1
+; CHECK-NEXT: ret [[COMPLEX_DOUBLE]] [[RES_1]]
+;
+  %ac = fmul arcp contract double %a, %c
+  %bd = fmul arcp afn ninf reassoc double %b, %d
+  %ad = fmul arcp afn ninf double %a, %d
+  %bc = fmul arcp nsz reassoc double %b, %c
+  %cc = fmul arcp nsz afn double %c, %c
+  %dd = fmul arcp nsz double %d, %d
+  %scale = fadd arcp nsz contract nnan reassoc double %cc, %dd
+  %x_noscale = fadd arcp nsz contract ninf nnan double %ac, %bd
+  %y_noscale = fsub arcp nsz contract reassoc double %bc, %ad
+  %x = fdiv arcp ninf nnan reassoc double %x_noscale, %scale
+  %y = fdiv arcp nnan double %y_noscale, %scale
+  %res = insertvalue %complex.double zeroinitializer, double %x, 0
+  %res.1 = insertvalue %complex.double %res, double %y, 1
+  ret %complex.double %res.1
+}
+
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync nounwind readnone willreturn }
+; CHECK: attributes #[[ATTR1]] = { "complex-limited-range" }
+; CHECK: attributes #[[ATTR2]] = { "complex-limited-range" "complex-no-scale" }
+;.
diff --git a/llvm/test/Verifier/complex-intrinsics.ll b/llvm/test/Verifier/complex-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Verifier/complex-intrinsics.ll
@@ -0,0 +1,39 @@
+; RUN: opt -verify -S < %s 2>&1 | FileCheck --check-prefix=CHECK1 %s
+; RUN: opt -verify -S < %s 2>&1 | FileCheck --check-prefix=CHECK2 %s
+; RUN: sed -e s/.T3:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK3 %s
+; RUN: sed -e s/.T4:// %s | not opt -verify -disable-output 2>&1 | FileCheck --check-prefix=CHECK4 %s
+
+; Check that a double-valued complex fmul is accepted, and that its
+; attributes are correct.
+; CHECK1: declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>) #[[ATTR:[0-9]+]]
+; CHECK1: attributes #[[ATTR]] = { nofree nosync nounwind readnone willreturn }
+declare <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double>, <2 x double>)
+define <2 x double> @t1(<2 x double> %a, <2 x double> %b) {
+  %res = call <2 x double> @llvm.experimental.complex.fmul.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %res
+}
+
+; Check that vector-of-complex values are supported.
+; CHECK2: declare <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double>, <4 x double>) #[[ATTR:[0-9]+]]
+; CHECK2: attributes #[[ATTR]] = { nofree nosync nounwind readnone willreturn }
declare <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double>, <4 x double>)
+define <4 x double> @t2(<4 x double> %a, <4 x double> %b) {
+  %res = call <4 x double> @llvm.experimental.complex.fmul.v4f64(<4 x double> %a, <4 x double> %b)
+  ret <4 x double> %res
+}
+
+; Check that odd-length vectors are rejected.
+; CHECK3: complex intrinsic must use an even-length vector of floating-point types
+;T3: declare <3 x double> @llvm.experimental.complex.fmul.v3f64(<3 x double>, <3 x double>)
+;T3: define <3 x double> @t3(<3 x double> %a, <3 x double> %b) {
+;T3:   %res = call <3 x double> @llvm.experimental.complex.fmul.v3f64(<3 x double> %a, <3 x double> %b)
+;T3:   ret <3 x double> %res
+;T3: }
+
+; Check that non-floating-point complex types are rejected.
+; CHECK4: complex intrinsic must use an even-length vector of floating-point types
+;T4: declare <2 x i64> @llvm.experimental.complex.fmul.v2i64(<2 x i64>, <2 x i64>)
+;T4: define <2 x i64> @t4(<2 x i64> %a, <2 x i64> %b) {
+;T4:   %res = call <2 x i64> @llvm.experimental.complex.fmul.v2i64(<2 x i64> %a, <2 x i64> %b)
+;T4:   ret <2 x i64> %res
+;T4: }
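+
+; Note: as in the expansion tests above, element 0 of a <2 x double> complex
+; value holds the real part and element 1 the imaginary part; for longer
+; even-length vectors such as <4 x double>, each consecutive pair of elements
+; is presumably one complex value.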