diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -596,6 +596,7 @@
 * 64-bit ARM (AArch64)
 * AMDGPU
 * SPIR
+* X86
 
 ``_Float16`` will be supported on more targets as they define ABIs for it.
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -164,7 +164,7 @@
 X86 Support in Clang
 --------------------
 
-- ...
+- Support for the ``_Float16`` type has been added.
 
 Internal API Changes
 --------------------
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -341,6 +341,7 @@
                            .Case("+sse", SSE1)
                            .Default(NoSSE);
     SSELevel = std::max(SSELevel, Level);
+    HasFloat16 = SSELevel >= SSE2;
 
     MMX3DNowEnum ThreeDNowLevel = llvm::StringSwitch<MMX3DNowEnum>(Feature)
                                       .Case("+3dnowa", AMD3DNowAthlon)
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -2814,6 +2814,8 @@
       Current = Integer;
     } else if (k == BuiltinType::Float || k == BuiltinType::Double) {
       Current = SSE;
+    } else if (k == BuiltinType::Float16) {
+      Current = SSE;
     } else if (k == BuiltinType::LongDouble) {
       const llvm::fltSemantics *LDF = &getTarget().getLongDoubleFormat();
       if (LDF == &llvm::APFloat::IEEEquad()) {
@@ -2943,6 +2945,8 @@
         Current = Integer;
       else if (Size <= 128)
         Lo = Hi = Integer;
+    } else if (ET->isFloat16Type()) {
+      Current = SSE;
     } else if (ET == getContext().FloatTy) {
       Current = SSE;
     } else if (ET == getContext().DoubleTy) {
@@ -3396,18 +3400,61 @@
   return false;
 }
 
+/// ContainsHalfAtOffset - Return true if the specified LLVM IR type has a
+/// half member at the specified offset. For example, {int,{half}} has a
+/// half at offset 4. It is conservatively correct for this routine to return
+/// false.
+/// FIXME: Merge with ContainsFloatAtOffset.
+static bool ContainsHalfAtOffset(llvm::Type *IRType, unsigned IROffset,
+                                 const llvm::DataLayout &TD) {
+  // Base case if we find a half.
+  if (IROffset == 0 && IRType->isHalfTy())
+    return true;
+
+  // If this is a struct, recurse into the field at the specified offset.
+  if (llvm::StructType *STy = dyn_cast<llvm::StructType>(IRType)) {
+    const llvm::StructLayout *SL = TD.getStructLayout(STy);
+    unsigned Elt = SL->getElementContainingOffset(IROffset);
+    IROffset -= SL->getElementOffset(Elt);
+    return ContainsHalfAtOffset(STy->getElementType(Elt), IROffset, TD);
+  }
+
+  // If this is an array, recurse into the field at the specified offset.
+  if (llvm::ArrayType *ATy = dyn_cast<llvm::ArrayType>(IRType)) {
+    llvm::Type *EltTy = ATy->getElementType();
+    unsigned EltSize = TD.getTypeAllocSize(EltTy);
+    IROffset -= IROffset / EltSize * EltSize;
+    return ContainsHalfAtOffset(EltTy, IROffset, TD);
+  }
+
+  return false;
+}
+
 /// GetSSETypeAtOffset - Return a type that will be passed by the backend in the
 /// low 8 bytes of an XMM register, corresponding to the SSE class.
 llvm::Type *X86_64ABIInfo::
 GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset,
                    QualType SourceTy, unsigned SourceOffset) const {
-  // The only three choices we have are either double, <2 x float>, or float. We
-  // pass as float if the last 4 bytes is just padding. This happens for
-  // structs that contain 3 floats.
-  if (BitsContainNoUserData(SourceTy, SourceOffset*8+32,
-                            SourceOffset*8+64, getContext()))
+  // If only 16 bits are used, pass in half.
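+  // For example, 'struct half1 { _Float16 a; };' from the fp16-abi.c test
+  // added below uses only the low 2 bytes of its eightbyte, so it is passed
+  // and returned as a plain 'half'.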
+ if (BitsContainNoUserData(SourceTy, SourceOffset * 8 + 16, + SourceOffset * 8 + 64, getContext())) + return llvm::Type::getHalfTy(getVMContext()); + + // If only 32 bits are used, we have two choices. Single float or two halfs. + if (BitsContainNoUserData(SourceTy, SourceOffset * 8 + 32, + SourceOffset * 8 + 64, getContext())) { + if (ContainsHalfAtOffset(IRType, IROffset, getDataLayout()) && + ContainsHalfAtOffset(IRType, IROffset + 2, getDataLayout())) + return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), + 2); return llvm::Type::getFloatTy(getVMContext()); + } + + // If 48 bits are used, we pass as <3 x half>. + if (BitsContainNoUserData(SourceTy, SourceOffset * 8 + 48, + SourceOffset * 8 + 64, getContext())) { + return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), 3); + } // We want to pass as <2 x float> if the LLVM IR type contains a float at // offset+0 and offset+4. Walk the LLVM IR type to find out if this is the @@ -3417,6 +3464,17 @@ return llvm::FixedVectorType::get(llvm::Type::getFloatTy(getVMContext()), 2); + // We want to pass as <4 x half> if the LLVM IR type contains a half at + // offset+0, +2, +4, +6. Walk the LLVM IR type to find out if this is the + // case. + if (ContainsHalfAtOffset(IRType, IROffset, getDataLayout()) && + ContainsHalfAtOffset(IRType, IROffset + 2, getDataLayout()) && + ContainsHalfAtOffset(IRType, IROffset + 4, getDataLayout()) && + ContainsHalfAtOffset(IRType, IROffset + 6, getDataLayout())) + return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), 4); + + // TODO: What about mixes of float and half? + return llvm::Type::getDoubleTy(getVMContext()); } diff --git a/clang/test/CodeGen/X86/fp16-abi.c b/clang/test/CodeGen/X86/fp16-abi.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/fp16-abi.c @@ -0,0 +1,113 @@ +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -target-feature +avx512fp16 < %s | FileCheck %s --check-prefixes=CHECK + +struct half1 { + _Float16 a; +}; + +struct half1 h1(_Float16 a) { + // CHECK: define{{.*}}half @h1 + struct half1 x; + x.a = a; + return x; +} + +struct half2 { + _Float16 a; + _Float16 b; +}; + +struct half2 h2(_Float16 a, _Float16 b) { + // CHECK: define{{.*}}<2 x half> @h2 + struct half2 x; + x.a = a; + x.b = b; + return x; +} + +struct half3 { + _Float16 a; + _Float16 b; + _Float16 c; +}; + +struct half3 h3(_Float16 a, _Float16 b, _Float16 c) { + // CHECK: define{{.*}}<3 x half> @h3 + struct half3 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct half4 { + _Float16 a; + _Float16 b; + _Float16 c; + _Float16 d; +}; + +struct half4 h4(_Float16 a, _Float16 b, _Float16 c, _Float16 d) { + // CHECK: define{{.*}}<4 x half> @h4 + struct half4 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct floathalf { + float a; + _Float16 b; +}; + +struct floathalf fh(float a, _Float16 b) { + // CHECK: <3 x half> @fh + struct floathalf x; + x.a = a; + x.b = b; + return x; +} + +struct floathalf2 { + float a; + _Float16 b; + _Float16 c; +}; + +struct floathalf2 fh2(float a, _Float16 b, _Float16 c) { + // CHECK: define{{.*}}double @fh2 + struct floathalf2 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct halffloat { + _Float16 a; + float b; +}; + +struct halffloat hf(_Float16 a, float b) { + // CHECK: define{{.*}}double @hf + struct halffloat x; + x.a = a; + x.b = b; + return x; +} + +struct half2float { + _Float16 a; + _Float16 b; + float c; +}; + +struct half2float h2f(_Float16 a, _Float16 b, float c) { + // CHECK: 
define{{.*}}double @h2f + struct half2float x; + x.a = a; + x.b = b; + x.c = c; + return x; +} diff --git a/clang/test/Sema/Float16.c b/clang/test/Sema/Float16.c --- a/clang/test/Sema/Float16.c +++ b/clang/test/Sema/Float16.c @@ -1,18 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -triple i686-linux-pc %s -target-feature +sse2 // RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-linux-pc %s -// RUN: %clang_cc1 -fsyntax-only -verify -triple spir-unknown-unknown %s -DHAVE -// RUN: %clang_cc1 -fsyntax-only -verify -triple armv7a-linux-gnu %s -DHAVE -// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64-linux-gnu %s -DHAVE +// RUN: %clang_cc1 -fsyntax-only -verify -triple spir-unknown-unknown %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple armv7a-linux-gnu %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64-linux-gnu %s -#ifndef HAVE -// expected-error@+2{{_Float16 is not supported on this target}} -#endif // HAVE _Float16 f; -#ifdef HAVE // FIXME: Should this be valid? _Complex _Float16 a; // expected-error {{'_Complex _Float16' is invalid}} void builtin_complex() { _Float16 a = 0; (void)__builtin_complex(a, a); // expected-error {{'_Complex _Float16' is invalid}} } -#endif diff --git a/clang/test/Sema/conversion-target-dep.c b/clang/test/Sema/conversion-target-dep.c --- a/clang/test/Sema/conversion-target-dep.c +++ b/clang/test/Sema/conversion-target-dep.c @@ -6,7 +6,7 @@ long double ld; double d; -_Float16 f16; // x86-error {{_Float16 is not supported on this target}} +_Float16 f16; int main() { ld = d; // x86-warning {{implicit conversion increases floating-point precision: 'double' to 'long double'}} diff --git a/clang/test/SemaCXX/Float16.cpp b/clang/test/SemaCXX/Float16.cpp --- a/clang/test/SemaCXX/Float16.cpp +++ b/clang/test/SemaCXX/Float16.cpp @@ -1,18 +1,11 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -triple i686-linux-pc %s -target-feature +sse2 // RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-linux-pc %s -// RUN: %clang_cc1 -fsyntax-only -verify -triple spir-unknown-unknown %s -DHAVE -// RUN: %clang_cc1 -fsyntax-only -verify -triple armv7a-linux-gnu %s -DHAVE -// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64-linux-gnu %s -DHAVE +// RUN: %clang_cc1 -fsyntax-only -verify -triple spir-unknown-unknown %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple armv7a-linux-gnu %s +// RUN: %clang_cc1 -fsyntax-only -verify -triple aarch64-linux-gnu %s -#ifdef HAVE // expected-no-diagnostics -#endif // HAVE -#ifndef HAVE -// expected-error@+2{{_Float16 is not supported on this target}} -#endif // !HAVE _Float16 f; -#ifndef HAVE -// expected-error@+2{{invalid suffix 'F16' on floating constant}} -#endif // !HAVE const auto g = 1.1F16; diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -104,6 +104,8 @@ During this release ... +* Support for ``_Float16`` ABI calling conversion has been added. 
+ Changes to the AMDGPU Target ----------------------------- diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -287,6 +287,7 @@ HANDLE_LIBCALL(FPEXT_F64_F128, "__extenddftf2") HANDLE_LIBCALL(FPEXT_F32_F128, "__extendsftf2") HANDLE_LIBCALL(FPEXT_F16_F128, "__extendhftf2") +HANDLE_LIBCALL(FPEXT_F16_F80, "__extendhfxf2") HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2") HANDLE_LIBCALL(FPEXT_F16_F64, "__extendhfdf2") HANDLE_LIBCALL(FPEXT_F16_F32, "__gnu_h2f_ieee") diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1051,6 +1051,10 @@ let IsLoad = true; let ScalarMemoryVT = i32; } +def extloadvf16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = true; + let ScalarMemoryVT = f16; +} def extloadvf32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> { let IsLoad = true; let ScalarMemoryVT = f32; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -820,6 +820,7 @@ case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; + case ISD::STRICT_FP_TO_FP16: case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; @@ -871,13 +872,17 @@ // We actually deal with the partially-softened FP_TO_FP16 node too, which // returns an i16 so doesn't meet the constraints necessary for FP_ROUND. assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16 || N->getOpcode() == ISD::STRICT_FP_ROUND); bool IsStrict = N->isStrictFPOpcode(); SDValue Op = N->getOperand(IsStrict ? 1 : 0); EVT SVT = Op.getValueType(); EVT RVT = N->getValueType(0); - EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT; + EVT FloatRVT = (N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16) + ? MVT::f16 + : RVT; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -236,6 +236,8 @@ return FPEXT_F16_F32; if (RetVT == MVT::f64) return FPEXT_F16_F64; + if (RetVT == MVT::f80) + return FPEXT_F16_F80; if (RetVT == MVT::f128) return FPEXT_F16_F128; } else if (OpVT == MVT::f32) { diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -233,25 +233,28 @@ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. 
This vector type is only // supported while using the AVX target feature. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // MMX vector types are always returned in MM0. If the target doesn't have // MM0, it doesn't support these vector types. CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, + // Half type is always returned in XMM0. + CCIfSubtarget<"hasSSE2()", CCIfType<[f16], CCAssignToReg<[XMM0, XMM1]>>>, + // Long double types are always returned in FP0 (even with SSE), // except on Win64. CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>> @@ -265,7 +268,7 @@ // case they use XMM0, otherwise it is the same as the common X86 calling // conv. CCIfInReg>>>, + CCIfType<[f16, f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, CCDelegateTo ]>; @@ -276,6 +279,7 @@ // SSE2. // This can happen when a float, 2 x float, or 3 x float vector is split by // target lowering, and is returned in 1-3 sse regs. + CCIfType<[f16], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, @@ -552,7 +556,7 @@ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -561,33 +565,33 @@ // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. - CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. - CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>, // Long doubles get stack slots whose size and alignment depends on the // subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. 
- CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -635,13 +639,13 @@ CCIfCFGuardTarget>, // 128 bit vectors are passed by pointer - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect>, + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer - CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect>, + CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, @@ -655,7 +659,7 @@ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, // The first 4 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64], + CCIfType<[f16, f32, f64], CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], [RCX , RDX , R8 , R9 ]>>, @@ -678,7 +682,7 @@ // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. - CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>> + CCIfType<[i8, i16, i32, i64, f16, f32, f64], CCAssignToStack<8, 8>> ]>; def CC_X86_Win64_VectorCall : CallingConv<[ @@ -735,7 +739,7 @@ // floating-point arguments are aligned to 4 byte and stored in 4 byte slots. // 64bit integer and floating-point arguments are aligned to 8 byte and stored // in 8 byte stack slots. - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i32, f16, f32], CCAssignToStack<4, 4>>, CCIfType<[i64, f64], CCAssignToStack<8, 8>> ]>; @@ -757,14 +761,15 @@ /// values are spilled on the stack. def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -772,16 +777,16 @@ // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo @@ -791,16 +796,16 @@ // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo @@ -815,7 +820,7 @@ // The first 3 float or double arguments, if marked 'inreg' and if the call // is not a vararg call and if SSE2 is available, are passed in SSE registers. 
- CCIfNotVarArg>>>>, @@ -824,6 +829,8 @@ CCIfNotVarArg>>, + CCIfType<[f16], CCAssignToStack<4, 4>>, + // Integer/Float values get stored in stack slots that are 4 bytes in // size and 4-byte aligned. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -55,6 +55,7 @@ /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf64; bool X86ScalarSSEf32; + bool X86ScalarSSEf16; public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, @@ -63,6 +64,7 @@ Subtarget = &funcInfo.MF->getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); + X86ScalarSSEf16 = Subtarget->hasSSE2(); } bool fastSelectInstruction(const Instruction *I) override; @@ -157,7 +159,8 @@ /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 + (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when SSE2 } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); @@ -2286,6 +2289,8 @@ case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::f16: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR16X + : X86::CMOV_FR16; break; case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break; case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X @@ -3907,6 +3912,9 @@ unsigned Opc = 0; switch (VT.SimpleTy) { default: return 0; + case MVT::f16: + Opc = HasAVX512 ? X86::AVX512_FsFLD0SH : X86::FsFLD0SH; + break; case MVT::f32: if (X86ScalarSSEf32) Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1121,7 +1121,10 @@ if (VT.isVector() || VT == MVT::f128) break; - MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32; + MVT VecVT = VT == MVT::f64 ? MVT::v2f64 + : VT == MVT::f32 ? MVT::v4f32 + : MVT::v8f16; + SDLoc dl(N); SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(0)); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -998,7 +998,8 @@ bool isCtlzFast() const override; bool hasBitPreservingFPLogic(EVT VT) const override { - return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); + return VT == MVT::f16 || VT == MVT::f32 || VT == MVT::f64 || + VT.isVector(); } bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { @@ -1282,7 +1283,8 @@ /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 + (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when SSE2 } /// Returns true if it is beneficial to convert a load of a constant @@ -1439,7 +1441,8 @@ /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. 
- /// When SSE2 is available, use it for f64 operations. + /// When SSE2 is available, use it for f16 and f64 operations. + bool X86ScalarSSEf16; bool X86ScalarSSEf32; bool X86ScalarSSEf64; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -117,6 +117,7 @@ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); + X86ScalarSSEf16 = Subtarget.hasSSE2(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. @@ -550,9 +551,13 @@ setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) { - // f32 and f64 use SSE. + // f16, f32 and f64 use SSE. // Set up the FP register classes. + addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass + : &X86::FR16RegClass); addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass); addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass @@ -584,6 +589,26 @@ setOperationAction(ISD::FSINCOS, VT, Expand); } + // Half type will be promoted by default. + setOperationAction(ISD::FABS, MVT::f16, Promote); + setOperationAction(ISD::FNEG, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); + setOperationAction(ISD::FADD, MVT::f16, Promote); + setOperationAction(ISD::FSUB, MVT::f16, Promote); + setOperationAction(ISD::FMUL, MVT::f16, Promote); + setOperationAction(ISD::FDIV, MVT::f16, Promote); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::BR_CC, MVT::f16, Promote); + setOperationAction(ISD::SETCC, MVT::f16, Promote); + setOperationAction(ISD::SELECT, MVT::f16, Promote); + setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); + setOperationAction(ISD::FP_ROUND, MVT::f16, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); @@ -658,6 +683,10 @@ } else // SSE immediates. addLegalFPImmediate(APFloat(+0.0)); // xorpd } + // Support fp16 0 immediate. + if (isTypeLegal(MVT::f16)) + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); + // Handle constrained floating-point operations of scalar. setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); @@ -667,7 +696,6 @@ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); @@ -676,6 +704,7 @@ // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); + setOperationAction(ISD::FMA, MVT::f16, Promote); // f80 always uses X87. 
if (UseX87) { @@ -719,7 +748,7 @@ setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten // as Custom. setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); @@ -785,6 +814,7 @@ } // Always use a library call for pow. + setOperationAction(ISD::FPOW , MVT::f16 , Promote); setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); @@ -1438,6 +1468,13 @@ } } + if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) { + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + } + // This block controls legalization of the mask vector sizes that are // available with AVX512. 512-bit vectors are in a separate block controlled // by useAVX512Regs. @@ -3668,6 +3705,8 @@ RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; + else if (RegVT == MVT::f16) + RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) @@ -18779,7 +18818,7 @@ MVT VT = Op.getSimpleValueType(); - if (VT.getSizeInBits() == 16) { + if (VT == MVT::i16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && @@ -19960,6 +19999,9 @@ MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); + if (VT == MVT::f16) + return SDValue(); + if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) return Extract; @@ -20451,7 +20493,7 @@ MVT DstVT = Op->getSimpleValueType(0); SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); - if (DstVT == MVT::f128) + if (DstVT == MVT::f128 || DstVT == MVT::f16) return SDValue(); if (DstVT.isVector()) @@ -21371,6 +21413,9 @@ assert(!VT.isVector()); + if (SrcVT == MVT::f16) + return SDValue(); + bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); if (!IsSigned && UseSSEReg) { @@ -21535,6 +21580,11 @@ EVT DstVT = Node->getValueType(0); EVT TmpVT = DstVT; + if (SrcVT.getSimpleVT() == MVT::f16) + return DAG.getNode(Node->getOpcode(), dl, DstVT, + DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src), + Node->getOperand(1)); + // This code is only for floats and doubles. Fall back to generic code for // anything else. 
if (!isScalarFPTypeInSSEReg(SrcVT)) @@ -21678,6 +21728,32 @@ if (VT == MVT::f128) return SDValue(); + if (SVT == MVT::f16) { + if (VT != MVT::f32) + return SDValue(); + + In = DAG.getBitcast(MVT::i16, In); + In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, + getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In, + DAG.getIntPtrConstant(0, DL)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other}, + {Op.getOperand(0), In}); + Chain = Res.getValue(1); + } else { + Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In, + DAG.getTargetConstant(4, DL, MVT::i32)); + } + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res, + DAG.getIntPtrConstant(0, DL)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + return Res; + } else if (!SVT.isVector()) { + return Op; + } + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); SDValue Res = @@ -21690,9 +21766,43 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); + MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); + MVT SVT = In.getSimpleValueType(); + + if (VT == MVT::f16) { + if (SVT != MVT::f32) + return SDValue(); + + SDLoc DL(Op); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32, + DAG.getConstantFP(0, DL, MVT::v4f32), In, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getNode( + X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other}, + {Op.getOperand(0), Res, DAG.getTargetConstant(4, DL, MVT::i32)}); + Chain = Res.getValue(1); + } else { + // FIXME: Should we use zeros for upper elements for non-strict? + Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In); + Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, + DAG.getTargetConstant(4, DL, MVT::i32)); + } + + Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, + DAG.getIntPtrConstant(0, DL)); + Res = DAG.getBitcast(MVT::f16, Res); + + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, DL); + + return Res; + } + // It's legal except when f128 is involved - if (In.getSimpleValueType() != MVT::f128) + if (SVT != MVT::f128) return Op; return SDValue(); @@ -23448,9 +23558,26 @@ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); + SDValue Op2 = Op.getOperand(IsStrict ? 3 : 2); SDLoc dl(Op); - ISD::CondCode CC = - cast(Op.getOperand(IsStrict ? 3 : 2))->get(); + ISD::CondCode CC = cast(Op2)->get(); + + if (Op0.getValueType() == MVT::f16) { + SDValue Res; + if (IsStrict) { + Op0 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, + {Chain, Op0}); + Op1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other}, + {Chain, Op1}); + Res = DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, + {Chain, Op0, Op1, Op2}); + } else { + Op0 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op0); + Op1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op1); + Res = DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Op2); + } + return Res; + } // Handle f128 first, since one possible outcome is a normal integer // comparison which gets handled by emitFlagsForSetcc. @@ -40185,7 +40312,7 @@ // Check if we have a bitcast from another integer type as well. 
if (!((Subtarget.hasSSE1() && VT == MVT::f32) || - (Subtarget.hasSSE2() && VT == MVT::f64) || + (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::f16)) || (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); @@ -41519,12 +41646,15 @@ SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); + EVT VT = LHS.getValueType(); + if (VT.isSimple() && VT.getSimpleVT() == MVT::f16) + return SDValue(); + // Try simplification again because we use this function to optimize // BLENDV nodes that are not handled by the generic combiner. if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS)) return V; - EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()); @@ -51726,7 +51856,8 @@ /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { - return RC.hasSuperClassEq(&X86::FR32XRegClass) || + return RC.hasSuperClassEq(&X86::FR16XRegClass) || + RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -505,6 +505,8 @@ // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in { + def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", + [(set FR16X:$dst, fp16imm0)]>; def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", [(set FR32X:$dst, fp32imm0)]>; def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", @@ -10952,6 +10954,13 @@ defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +let Predicates = [HasBWI] in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWZrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16X)>; + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWZmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWZrr (v8i16 (COPY_TO_REGCLASS FR16X:$src, VR128X)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWZrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16X)>; +} + //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -557,9 +557,12 @@ let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO; - let Predicates = [HasSSE2,NoAVX512] in + let Predicates = [HasSSE2,NoAVX512] in { + defm _FR16 : CMOVrr_PSEUDO; defm _FR64 : CMOVrr_PSEUDO; + } let Predicates = [HasAVX512] in { + defm _FR16X : CMOVrr_PSEUDO; defm _FR32X : CMOVrr_PSEUDO; defm _FR64X : CMOVrr_PSEUDO; } @@ -591,6 +594,8 @@ (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; def : Pat<(v4i32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), 
(CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; + def : Pat<(v8f16 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), + (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; def : Pat<(v4f32 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), (CMOV_VR128 VR128:$t, VR128:$f, timm:$cond)>; def : Pat<(v2f64 (X86cmov VR128:$t, VR128:$f, timm:$cond, EFLAGS)), @@ -602,6 +607,8 @@ (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; def : Pat<(v8i32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; + def : Pat<(v16f16 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), + (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; def : Pat<(v8f32 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), (CMOV_VR256 VR256:$t, VR256:$f, timm:$cond)>; def : Pat<(v4f64 (X86cmov VR256:$t, VR256:$f, timm:$cond, EFLAGS)), @@ -614,6 +621,8 @@ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v8f16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v2f64 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), @@ -625,6 +634,8 @@ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v16f16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v4f64 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), @@ -637,6 +648,8 @@ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v32f16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v8f64 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -796,6 +796,7 @@ //===----------------------------------------------------------------------===// // 128-bit load pattern fragments +def loadv8f16 : PatFrag<(ops node:$ptr), (v8f16 (load node:$ptr))>; def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; @@ -804,6 +805,7 @@ def loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>; // 256-bit load pattern fragments +def loadv16f16 : PatFrag<(ops node:$ptr), (v16f16 (load node:$ptr))>; def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; @@ -812,6 +814,7 @@ def loadv32i8 : PatFrag<(ops node:$ptr), (v32i8 (load node:$ptr))>; // 512-bit load pattern fragments +def loadv32f16 : PatFrag<(ops node:$ptr), (v32f16 (load node:$ptr))>; def loadv16f32 : PatFrag<(ops 
node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; @@ -823,6 +826,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv2f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv4f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv8f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv16f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; // Like 'store', but always requires vector size alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -839,6 +846,8 @@ // 128-bit aligned load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv8f16 : PatFrag<(ops node:$ptr), + (v8f16 (alignedload node:$ptr))>; def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), @@ -854,6 +863,8 @@ // 256-bit aligned load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv16f16 : PatFrag<(ops node:$ptr), + (v16f16 (alignedload node:$ptr))>; def alignedloadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (alignedload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), @@ -868,6 +879,8 @@ (v32i8 (alignedload node:$ptr))>; // 512-bit aligned load pattern fragments +def alignedloadv32f16 : PatFrag<(ops node:$ptr), + (v32f16 (alignedload node:$ptr))>; def alignedloadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (alignedload node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), @@ -926,6 +939,11 @@ def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; +def X86vzload16 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 2; +}]>; + def X86vzload32 : PatFrag<(ops node:$src), (X86vzld node:$src), [{ return cast(N)->getMemoryVT().getStoreSize() == 4; @@ -976,6 +994,10 @@ // only load a single element. // FIXME: We should add more canolicalizing in DAGCombine. Particulary removing // the simple_load case. 
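+// Scalar f16 memory operand for instructions that only load a single half
+// element; mirrors sse_load_f32/sse_load_f64 below.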
+def sse_load_f16 : PatFrags<(ops node:$ptr), + [(v8f16 (simple_load node:$ptr)), + (v8f16 (X86vzload16 node:$ptr)), + (v8f16 (scalar_to_vector (loadf16 node:$ptr)))]>; def sse_load_f32 : PatFrags<(ops node:$ptr), [(v4f32 (simple_load node:$ptr)), (v4f32 (X86vzload32 node:$ptr)), @@ -985,9 +1007,13 @@ (v2f64 (X86vzload64 node:$ptr)), (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>; +def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def fp16imm0 : PatLeaf<(f16 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -981,10 +981,12 @@ case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0SS: + case X86::AVX512_FsFLD0SH: case X86::AVX512_FsFLD0F128: case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: + case X86::FsFLD0SH: case X86::FsFLD0F128: case X86::KSET0D: case X86::KSET0Q: @@ -3796,12 +3798,12 @@ const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) && "Stack slot too small for store"); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILESTORED; // tilestored %tmm, (%sp, %idx) - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); MachineInstr *NewMI = @@ -3810,6 +3812,23 @@ MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if (RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) { + if (Subtarget.hasSSE41() || Subtarget.hasAVX() || Subtarget.hasBWI()) { + unsigned Opc = Subtarget.hasBWI() ? X86::VPEXTRWZmr + : Subtarget.hasAVX() ? 
X86::VPEXTRWmr + : X86::PEXTRWmr; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)) + .addImm(0); + } else { + Register VirtReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); + BuildMI(MBB, MI, DebugLoc(), get(X86::MOVPDI2DIrr), VirtReg) + .addReg(SrcReg, getKillRegState(isKill)); + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(X86::MOV16mr)), + FrameIdx) + .addReg(VirtReg, getKillRegState(isKill), X86::sub_16bit); + } } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3826,10 +3845,10 @@ Register DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { + MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); if (RC->getID() == X86::TILERegClassID) { unsigned Opc = X86::TILELOADD; // tileloadd (%sp, %idx), %tmm - MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); MachineInstr *NewMI = BuildMI(MBB, MI, DebugLoc(), get(X86::MOV64ri), VirtReg).addImm(64); @@ -3838,6 +3857,15 @@ MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if (RC->getID() == X86::FR16RegClassID || + RC->getID() == X86::FR16XRegClassID) { + unsigned Opc = Subtarget.hasBWI() ? X86::VPINSRWZrm + : Subtarget.hasAVX() ? X86::VPINSRWrm + : X86::PINSRWrm; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg) + .addReg(DestReg, RegState::Undef), + FrameIdx) + .addImm(0); } else { const MachineFunction &MF = *MBB.getParent(); const MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -4688,6 +4716,7 @@ case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: + case X86::FsFLD0SH: case X86::FsFLD0F128: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: { @@ -4701,6 +4730,7 @@ return true; } case X86::AVX512_128_SET0: + case X86::AVX512_FsFLD0SH: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0F128: { @@ -6047,6 +6077,10 @@ case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::FsFLD0SH: + case X86::AVX512_FsFLD0SH: + Alignment = Align(2); + break; default: return nullptr; } @@ -6086,6 +6120,8 @@ case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: case X86::AVX512_FsFLD0SS: + case X86::FsFLD0SH: + case X86::AVX512_FsFLD0SH: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. 
@@ -6114,7 +6150,9 @@ MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI.getOpcode(); - if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS) + if (Opc == X86::FsFLD0SH || Opc == X86::AVX512_FsFLD0SH) + Ty = Type::getHalfTy(MF.getFunction().getContext()); + else if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction().getContext()); diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -421,6 +421,7 @@ def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; @@ -1193,6 +1194,7 @@ }]>; def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf16 : PatFrag<(ops node:$ptr), (f16 (load node:$ptr))>; def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -112,6 +112,8 @@ // This is expanded by ExpandPostRAPseudos. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "", + [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>; def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", @@ -3968,6 +3970,19 @@ } // ExeDomain = SSEPackedInt +let Predicates = [HasSSE2] in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + +let Predicates = [HasAVX, NoBWI] in { + def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>; + def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>; + def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>; +} + //===---------------------------------------------------------------------===// // SSE2 - Packed Mask Creation //===---------------------------------------------------------------------===// @@ -5193,6 +5208,12 @@ defm PEXTRW : SS41I_extract16<0x15, "pextrw">; +let Predicates = [UseSSE41] in + def 
: Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + +let Predicates = [HasAVX, NoBWI] in + def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>; + /// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination multiclass SS41I_extract32 opc, string OpcodeStr> { diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -16,7 +16,9 @@ //===----------------------------------------------------------------------===// let Predicates = [NoAVX512] in { - // A vector extract of the first f32/f64 position is a subregister copy + // A vector extract of the first f16/f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128:$src), FR16)>; def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), @@ -24,7 +26,9 @@ } let Predicates = [HasAVX512] in { - // A vector extract of the first f32/f64 position is a subregister copy + // A vector extract of the first f16/f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128X:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128X:$src), FR16X)>; def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>; def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), @@ -32,6 +36,9 @@ } let Predicates = [NoVLX] in { + // Implicitly promote a 16-bit scalar to a vector. + def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), + (COPY_TO_REGCLASS FR16X:$src, VR128)>; // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; @@ -41,6 +48,9 @@ } let Predicates = [HasVLX] in { + // Implicitly promote a 16-bit scalar to a vector. + def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), + (COPY_TO_REGCLASS FR16X:$src, VR128X)>; // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32X:$src)), (COPY_TO_REGCLASS FR32X:$src, VR128X)>; @@ -74,6 +84,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -85,6 +96,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. 
Likewise, a 128-bit subvector @@ -96,6 +108,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // If we're inserting into an all zeros vector, just use a plain move which @@ -114,6 +127,7 @@ let Predicates = [HasAVX, NoVLX] in { defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>; defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"APS", VR128, v16f16, v8f16, v8i32, sub_xmm>; defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>; defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>; defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>; @@ -123,6 +137,7 @@ let Predicates = [HasVLX] in { defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>; defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ128", VR128X, v16f16, v8f16, v8i32, sub_xmm>; defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>; defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>; defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>; @@ -130,6 +145,7 @@ defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>; defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ128", VR128X, v32f16, v8f16, v16i32, sub_xmm>; defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>; defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>; defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>; @@ -137,6 +153,7 @@ defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>; defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"APSZ256", VR256X, v32f16, v16f16, v16i32, sub_ymm>; defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>; defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>; defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>; @@ -146,6 +163,7 @@ let Predicates = [HasAVX512, NoVLX] in { defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>; defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"APS", VR128, v32f16, v8f16, v16i32, sub_xmm>; defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>; defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>; defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>; @@ -153,6 +171,7 @@ defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>; defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"APSY", VR256, v32f16, v16f16, v16i32, sub_ymm>; defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>; defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>; defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -537,6 +537,8 @@ def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; 
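+// Scalar half-precision values live in the low 16 bits of an XMM register,
+// so FR16 covers the same registers as FR32.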
+def FR16 : RegisterClass<"X86", [f16], 16, (add FR32)>; + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. This will cause us to spill @@ -567,9 +569,9 @@ // Generic vector registers: VR64 and VR128. // Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -587,7 +589,7 @@ } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v32f16, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Represents the lower 16 registers that have VEX/legacy encodable subregs. @@ -599,10 +601,12 @@ def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; +def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>; + // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers diff --git a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir --- a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir +++ b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir @@ -28,8 +28,8 @@ liveins: $rdi, $rsi ; CHECK-LABEL: name: test - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4325386 /* regdef:GR64 */, def $rsi, 4325386 /* regdef:GR64 */, def dead $rdi, - INLINEASM &foo, 0, 4325386, def $rsi, 4325386, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi, + INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RETQ killed $rax ... @@ -45,8 +45,8 @@ ; Verify that the register ties are preserved. 
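A note on the updated immediates in these INLINEASM checks: the regdef flag word keeps the operand kind in its low bits and the GR64 register-class number in its upper half, so only the class field moves when the new FR16/FR16X classes shift the auto-generated numbering. The breakdown below is for orientation only:
;   old flag: 4325386 = 0x42000A   (low bits 0x000A, the reg-def kind, unchanged)
;   new flag: 4456458 = 0x44000A   (class field 0x42 -> 0x44, i.e. +2 for FR16 and FR16X)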
; CHECK-LABEL: name: test2 - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4325386 /* regdef:GR64 */, def $rsi, 4325386 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags - INLINEASM &foo, 0, 4325386, def $rsi, 4325386, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4456458 /* regdef:GR64 */, def $rsi, 4456458 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &foo, 0, 4456458, def $rsi, 4456458, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RETQ killed $rax ... diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -15,17 +15,40 @@ ; (Specifically, there were reviewer questions about the lowering for halfs ; and their calling convention which remain unresolved.) +; FIXME: We should remove support for SSE1 and NOSSE. define void @store_half(half* %fptr, half %v) { -; X86-LABEL: store_half: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movw %ax, (%ecx) -; X86-NEXT: retl +; X86-SSE1-LABEL: store_half: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movw %ax, (%ecx) +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: store_half: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: movw %cx, (%eax) +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: store_half: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-AVX-NEXT: movw %cx, (%eax) +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: store_half: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: movw %ax, (%ecx) +; X86-NOSSE-NEXT: retl ; ; X64-LABEL: store_half: ; X64: # %bb.0: -; X64-NEXT: movw %si, (%rdi) +; X64-NEXT: pextrw $0, %xmm0, %eax +; X64-NEXT: movw %ax, (%rdi) ; X64-NEXT: retq store atomic half %v, half* %fptr unordered, align 2 ret void @@ -192,16 +215,38 @@ ret void } +; FIXME: We should remove support for SSE1 and NOSSE. 
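; For orientation, a minimal sketch (not taken from this patch's tests; the
; function name is illustrative) of how a plain, non-atomic version of the
; store_half case above is expected to lower on x86-64 with SSE2, now that the
; half value is carried in an XMM register rather than a GPR:
;
;   define void @store_half_plain(half* %p, half %v) {
;     store half %v, half* %p, align 2
;     ret void
;   }
;
; which should compile to roughly:
;   pextrw $0, %xmm0, %eax
;   movw   %ax, (%rdi)
;   retq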
define half @load_half(half* %fptr) { -; X86-LABEL: load_half: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %eax -; X86-NEXT: retl +; X86-SSE1-LABEL: load_half: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: movzwl (%eax), %eax +; X86-SSE1-NEXT: retl +; +; X86-SSE2-LABEL: load_half: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: movzwl (%eax), %eax +; X86-SSE2-NEXT: pinsrw $0, %eax, %xmm0 +; X86-SSE2-NEXT: retl +; +; X86-AVX-LABEL: load_half: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movzwl (%eax), %eax +; X86-AVX-NEXT: pinsrw $0, %eax, %xmm0 +; X86-AVX-NEXT: retl +; +; X86-NOSSE-LABEL: load_half: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: movzwl (%eax), %eax +; X86-NOSSE-NEXT: retl ; ; X64-LABEL: load_half: ; X64: # %bb.0: ; X64-NEXT: movzwl (%rdi), %eax +; X64-NEXT: pinsrw $0, %eax, %xmm0 ; X64-NEXT: retq %v = load atomic half, half* %fptr unordered, align 2 ret half %v diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -2189,22 +2189,20 @@ ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $1, %k2, %k2 ; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k1 -; KNL-NEXT: kmovw %k1, %ecx -; KNL-NEXT: xorl %eax, %eax -; KNL-NEXT: testb $1, %cl -; KNL-NEXT: movl $0, %ecx -; KNL-NEXT: je LBB85_2 -; KNL-NEXT: ## %bb.1: -; KNL-NEXT: movzwl 2(%rsi), %ecx -; KNL-NEXT: LBB85_2: -; KNL-NEXT: kmovw %k0, %edi -; KNL-NEXT: testb $1, %dil -; KNL-NEXT: je LBB85_4 -; KNL-NEXT: ## %bb.3: +; KNL-NEXT: kandw %k1, %k0, %k1 +; KNL-NEXT: kshiftrw $1, %k1, %k2 ; KNL-NEXT: movzwl (%rsi), %eax -; KNL-NEXT: LBB85_4: +; KNL-NEXT: movzwl 2(%rsi), %ecx +; KNL-NEXT: vmovd %ecx, %xmm0 +; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 +; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k2} {z} +; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; KNL-NEXT: vmovd %xmm0, %ecx +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vcvtph2ps %xmm0, %xmm0 +; KNL-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} +; KNL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: movw %ax, (%rdx) ; KNL-NEXT: movw %cx, 2(%rdx) ; KNL-NEXT: retq @@ -2239,22 +2237,20 @@ ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $7, %k2, %k2 ; SKX-NEXT: korw %k1, %k2, %k1 -; SKX-NEXT: kandw %k1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k1 -; SKX-NEXT: kmovd %k1, %ecx -; SKX-NEXT: xorl %eax, %eax -; SKX-NEXT: testb $1, %cl -; SKX-NEXT: movl $0, %ecx -; SKX-NEXT: je LBB85_2 -; SKX-NEXT: ## %bb.1: -; SKX-NEXT: movzwl 2(%rsi), %ecx -; SKX-NEXT: LBB85_2: -; SKX-NEXT: kmovd %k0, %edi -; SKX-NEXT: testb $1, %dil -; SKX-NEXT: je LBB85_4 -; SKX-NEXT: ## %bb.3: +; SKX-NEXT: kandw %k1, %k0, %k1 +; SKX-NEXT: kshiftrb $1, %k1, %k2 ; SKX-NEXT: movzwl (%rsi), %eax -; SKX-NEXT: LBB85_4: +; SKX-NEXT: movzwl 2(%rsi), %ecx +; SKX-NEXT: vmovd %ecx, %xmm0 +; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k2} {z} +; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, %ecx +; SKX-NEXT: vmovd %eax, %xmm0 +; SKX-NEXT: vcvtph2ps %xmm0, %xmm0 +; SKX-NEXT: vmovss %xmm0, %xmm0, %xmm0 {%k1} {z} +; SKX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; SKX-NEXT: vmovd %xmm0, %eax ; SKX-NEXT: movw %ax, (%rdx) ; SKX-NEXT: movw %cx, 2(%rdx) ; SKX-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll --- a/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll +++ b/llvm/test/CodeGen/X86/avx512-masked_memop-16-8.ll @@ -152,213 +152,161 @@ declare void @llvm.masked.store.v32i16.p0v32i16(<32 x i16>, <32 x i16>*, i32, <32 x i1>) -; Make sure we scalarize masked loads of f16. define <16 x half> @test_mask_load_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { ; CHECK-LABEL: test_mask_load_16xf16: ; CHECK: ## %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 56 -; CHECK-NEXT: .cfi_offset %rbx, -56 -; CHECK-NEXT: .cfi_offset %r12, -48 -; CHECK-NEXT: .cfi_offset %r13, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 -; CHECK-NEXT: vpmovmskb %xmm0, %r11d -; CHECK-NEXT: testb $1, %r11b +; CHECK-NEXT: vpmovmskb %xmm0, %ecx +; CHECK-NEXT: testb $1, %cl ; CHECK-NEXT: je LBB12_1 ; CHECK-NEXT: ## %bb.2: ## %cond.load -; CHECK-NEXT: movzwl (%rsi), %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; CHECK-NEXT: pinsrw $0, (%rsi), %xmm8 ; CHECK-NEXT: jmp LBB12_3 ; CHECK-NEXT: LBB12_1: -; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill +; CHECK-NEXT: vpxor %xmm8, %xmm8, %xmm8 ; CHECK-NEXT: LBB12_3: ## %else -; CHECK-NEXT: xorl %edi, %edi -; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill -; CHECK-NEXT: movl %edi, %ecx -; CHECK-NEXT: testb $2, %r11b +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; CHECK-NEXT: vmovdqa %xmm2, %xmm10 +; CHECK-NEXT: vmovdqa %xmm2, %xmm4 +; CHECK-NEXT: vmovdqa %xmm2, %xmm5 +; CHECK-NEXT: vmovdqa %xmm2, %xmm6 +; CHECK-NEXT: vmovdqa %xmm2, %xmm7 +; CHECK-NEXT: vmovdqa %xmm2, %xmm1 +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 +; CHECK-NEXT: vmovdqa %xmm2, %xmm3 +; CHECK-NEXT: vmovdqa %xmm2, %xmm11 +; CHECK-NEXT: vmovdqa %xmm2, %xmm12 +; CHECK-NEXT: vmovdqa %xmm2, %xmm13 +; CHECK-NEXT: vmovdqa %xmm2, %xmm14 +; CHECK-NEXT: testb $2, %cl ; CHECK-NEXT: je LBB12_4 ; CHECK-NEXT: ## %bb.5: ## %cond.load1 -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movl %edi, %r12d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movl %edi, %r13d -; CHECK-NEXT: movl %edi, %r14d -; CHECK-NEXT: movl %edi, %r8d -; CHECK-NEXT: movl %edi, %r9d -; CHECK-NEXT: movl %edi, %r10d -; CHECK-NEXT: movl %edi, %r15d -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movzwl 2(%rsi), %edi -; CHECK-NEXT: ## kill: def $di killed $di def $edi -; CHECK-NEXT: testb $4, %r11b +; CHECK-NEXT: vmovdqa %xmm2, %xmm15 +; CHECK-NEXT: pinsrw $0, 2(%rsi), %xmm2 +; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: jne LBB12_7 ; CHECK-NEXT: jmp LBB12_8 ; CHECK-NEXT: LBB12_4: -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movl %edi, %r12d -; CHECK-NEXT: movl %edi, %ebx -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movl %edi, %r13d -; CHECK-NEXT: movl %edi, %r14d 
-; CHECK-NEXT: movl %edi, %r8d -; CHECK-NEXT: movl %edi, %r9d -; CHECK-NEXT: movl %edi, %r10d -; CHECK-NEXT: movl %edi, %r15d -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: movw %di, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testb $4, %r11b +; CHECK-NEXT: vmovdqa %xmm2, %xmm15 +; CHECK-NEXT: testb $4, %cl ; CHECK-NEXT: je LBB12_8 ; CHECK-NEXT: LBB12_7: ## %cond.load4 -; CHECK-NEXT: movzwl 4(%rsi), %ecx -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; CHECK-NEXT: pinsrw $0, 4(%rsi), %xmm10 ; CHECK-NEXT: LBB12_8: ## %else5 -; CHECK-NEXT: testb $8, %r11b +; CHECK-NEXT: testb $8, %cl ; CHECK-NEXT: jne LBB12_9 ; CHECK-NEXT: ## %bb.10: ## %else8 -; CHECK-NEXT: testb $16, %r11b +; CHECK-NEXT: testb $16, %cl ; CHECK-NEXT: jne LBB12_11 ; CHECK-NEXT: LBB12_12: ## %else11 -; CHECK-NEXT: testb $32, %r11b +; CHECK-NEXT: testb $32, %cl ; CHECK-NEXT: jne LBB12_13 ; CHECK-NEXT: LBB12_14: ## %else14 -; CHECK-NEXT: testb $64, %r11b +; CHECK-NEXT: testb $64, %cl ; CHECK-NEXT: jne LBB12_15 ; CHECK-NEXT: LBB12_16: ## %else17 -; CHECK-NEXT: testb $-128, %r11b +; CHECK-NEXT: testb $-128, %cl ; CHECK-NEXT: jne LBB12_17 ; CHECK-NEXT: LBB12_18: ## %else20 -; CHECK-NEXT: testl $256, %r11d ## imm = 0x100 +; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 ; CHECK-NEXT: jne LBB12_19 ; CHECK-NEXT: LBB12_20: ## %else23 -; CHECK-NEXT: testl $512, %r11d ## imm = 0x200 +; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 ; CHECK-NEXT: jne LBB12_21 ; CHECK-NEXT: LBB12_22: ## %else26 -; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400 +; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 ; CHECK-NEXT: jne LBB12_23 ; CHECK-NEXT: LBB12_24: ## %else29 -; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800 +; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 ; CHECK-NEXT: jne LBB12_25 ; CHECK-NEXT: LBB12_26: ## %else32 -; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000 -; CHECK-NEXT: je LBB12_28 -; CHECK-NEXT: LBB12_27: ## %cond.load34 -; CHECK-NEXT: movzwl 24(%rsi), %edx +; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 +; CHECK-NEXT: jne LBB12_27 ; CHECK-NEXT: LBB12_28: ## %else35 -; CHECK-NEXT: movw %dx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testl $8192, %r11d ## imm = 0x2000 +; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 ; CHECK-NEXT: jne LBB12_29 -; CHECK-NEXT: ## %bb.30: ## %else38 -; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000 +; CHECK-NEXT: LBB12_30: ## %else38 +; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 ; CHECK-NEXT: jne LBB12_31 ; CHECK-NEXT: LBB12_32: ## %else41 -; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000 -; CHECK-NEXT: je LBB12_33 -; CHECK-NEXT: LBB12_34: ## %cond.load43 -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movzwl 30(%rsi), %esi -; CHECK-NEXT: jmp LBB12_35 +; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 +; CHECK-NEXT: je LBB12_34 +; CHECK-NEXT: LBB12_33: ## %cond.load43 +; CHECK-NEXT: pinsrw $0, 30(%rsi), %xmm9 +; CHECK-NEXT: LBB12_34: ## %else44 +; CHECK-NEXT: vpextrw $0, %xmm8, (%rax) +; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rax) +; CHECK-NEXT: vpextrw $0, %xmm10, 4(%rax) +; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rax) +; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rax) +; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rax) +; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rax) +; CHECK-NEXT: vpextrw $0, %xmm1, 14(%rax) +; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rax) +; CHECK-NEXT: vpextrw $0, %xmm3, 18(%rax) +; CHECK-NEXT: vpextrw $0, %xmm11, 20(%rax) +; CHECK-NEXT: vpextrw $0, %xmm12, 22(%rax) +; 
CHECK-NEXT: vpextrw $0, %xmm13, 24(%rax) +; CHECK-NEXT: vpextrw $0, %xmm14, 26(%rax) +; CHECK-NEXT: vpextrw $0, %xmm15, 28(%rax) +; CHECK-NEXT: vpextrw $0, %xmm9, 30(%rax) +; CHECK-NEXT: retq ; CHECK-NEXT: LBB12_9: ## %cond.load7 -; CHECK-NEXT: movzwl 6(%rsi), %r12d -; CHECK-NEXT: testb $16, %r11b +; CHECK-NEXT: pinsrw $0, 6(%rsi), %xmm4 +; CHECK-NEXT: testb $16, %cl ; CHECK-NEXT: je LBB12_12 ; CHECK-NEXT: LBB12_11: ## %cond.load10 -; CHECK-NEXT: movzwl 8(%rsi), %ebx -; CHECK-NEXT: testb $32, %r11b +; CHECK-NEXT: pinsrw $0, 8(%rsi), %xmm5 +; CHECK-NEXT: testb $32, %cl ; CHECK-NEXT: je LBB12_14 ; CHECK-NEXT: LBB12_13: ## %cond.load13 -; CHECK-NEXT: movzwl 10(%rsi), %ebp -; CHECK-NEXT: testb $64, %r11b +; CHECK-NEXT: pinsrw $0, 10(%rsi), %xmm6 +; CHECK-NEXT: testb $64, %cl ; CHECK-NEXT: je LBB12_16 ; CHECK-NEXT: LBB12_15: ## %cond.load16 -; CHECK-NEXT: movzwl 12(%rsi), %r13d -; CHECK-NEXT: testb $-128, %r11b +; CHECK-NEXT: pinsrw $0, 12(%rsi), %xmm7 +; CHECK-NEXT: testb $-128, %cl ; CHECK-NEXT: je LBB12_18 ; CHECK-NEXT: LBB12_17: ## %cond.load19 -; CHECK-NEXT: movzwl 14(%rsi), %r14d -; CHECK-NEXT: testl $256, %r11d ## imm = 0x100 +; CHECK-NEXT: pinsrw $0, 14(%rsi), %xmm1 +; CHECK-NEXT: testl $256, %ecx ## imm = 0x100 ; CHECK-NEXT: je LBB12_20 ; CHECK-NEXT: LBB12_19: ## %cond.load22 -; CHECK-NEXT: movzwl 16(%rsi), %r8d -; CHECK-NEXT: testl $512, %r11d ## imm = 0x200 +; CHECK-NEXT: pinsrw $0, 16(%rsi), %xmm0 +; CHECK-NEXT: testl $512, %ecx ## imm = 0x200 ; CHECK-NEXT: je LBB12_22 ; CHECK-NEXT: LBB12_21: ## %cond.load25 -; CHECK-NEXT: movzwl 18(%rsi), %r9d -; CHECK-NEXT: testl $1024, %r11d ## imm = 0x400 +; CHECK-NEXT: pinsrw $0, 18(%rsi), %xmm3 +; CHECK-NEXT: testl $1024, %ecx ## imm = 0x400 ; CHECK-NEXT: je LBB12_24 ; CHECK-NEXT: LBB12_23: ## %cond.load28 -; CHECK-NEXT: movzwl 20(%rsi), %r10d -; CHECK-NEXT: testl $2048, %r11d ## imm = 0x800 +; CHECK-NEXT: pinsrw $0, 20(%rsi), %xmm11 +; CHECK-NEXT: testl $2048, %ecx ## imm = 0x800 ; CHECK-NEXT: je LBB12_26 ; CHECK-NEXT: LBB12_25: ## %cond.load31 -; CHECK-NEXT: movzwl 22(%rsi), %r15d -; CHECK-NEXT: testl $4096, %r11d ## imm = 0x1000 -; CHECK-NEXT: jne LBB12_27 -; CHECK-NEXT: jmp LBB12_28 +; CHECK-NEXT: pinsrw $0, 22(%rsi), %xmm12 +; CHECK-NEXT: testl $4096, %ecx ## imm = 0x1000 +; CHECK-NEXT: je LBB12_28 +; CHECK-NEXT: LBB12_27: ## %cond.load34 +; CHECK-NEXT: pinsrw $0, 24(%rsi), %xmm13 +; CHECK-NEXT: testl $8192, %ecx ## imm = 0x2000 +; CHECK-NEXT: je LBB12_30 ; CHECK-NEXT: LBB12_29: ## %cond.load37 -; CHECK-NEXT: movzwl 26(%rsi), %ecx -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testl $16384, %r11d ## imm = 0x4000 +; CHECK-NEXT: pinsrw $0, 26(%rsi), %xmm14 +; CHECK-NEXT: testl $16384, %ecx ## imm = 0x4000 ; CHECK-NEXT: je LBB12_32 ; CHECK-NEXT: LBB12_31: ## %cond.load40 -; CHECK-NEXT: movzwl 28(%rsi), %ecx -; CHECK-NEXT: movw %cx, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; CHECK-NEXT: testl $32768, %r11d ## imm = 0x8000 -; CHECK-NEXT: jne LBB12_34 -; CHECK-NEXT: LBB12_33: -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %esi ## 4-byte Reload -; CHECK-NEXT: LBB12_35: ## %else44 -; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edx ## 4-byte Reload -; CHECK-NEXT: movw %dx, (%rax) -; CHECK-NEXT: movw %di, 2(%rax) -; CHECK-NEXT: movw %cx, 4(%rax) -; CHECK-NEXT: movw %r12w, 6(%rax) -; CHECK-NEXT: movw %bx, 8(%rax) -; CHECK-NEXT: movw %bp, 10(%rax) -; CHECK-NEXT: movw %r13w, 12(%rax) -; CHECK-NEXT: movw %r14w, 14(%rax) -; CHECK-NEXT: movw 
%r8w, 16(%rax) -; CHECK-NEXT: movw %r9w, 18(%rax) -; CHECK-NEXT: movw %r10w, 20(%rax) -; CHECK-NEXT: movw %r15w, 22(%rax) -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movw %cx, 24(%rax) -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movw %cx, 26(%rax) -; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %ecx ## 2-byte Folded Reload -; CHECK-NEXT: movw %cx, 28(%rax) -; CHECK-NEXT: movw %si, 30(%rax) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: retq +; CHECK-NEXT: pinsrw $0, 28(%rsi), %xmm15 +; CHECK-NEXT: testl $32768, %ecx ## imm = 0x8000 +; CHECK-NEXT: jne LBB12_33 +; CHECK-NEXT: jmp LBB12_34 %res = call <16 x half> @llvm.masked.load.v16f16(<16 x half>* %addr, i32 4, <16 x i1>%mask, <16 x half> zeroinitializer) ret <16 x half> %res } declare <16 x half> @llvm.masked.load.v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>) -; Make sure we scalarize masked stores of f16. define void @test_mask_store_16xf16(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { ; CHECK-LABEL: test_mask_store_16xf16: ; CHECK: ## %bb.0: @@ -414,78 +362,76 @@ ; CHECK-NEXT: LBB13_32: ## %else30 ; CHECK-NEXT: retq ; CHECK-NEXT: LBB13_1: ## %cond.store -; CHECK-NEXT: movw %si, (%rdi) +; CHECK-NEXT: vpextrw $0, %xmm1, (%rdi) ; CHECK-NEXT: testb $2, %al ; CHECK-NEXT: je LBB13_4 ; CHECK-NEXT: LBB13_3: ## %cond.store1 -; CHECK-NEXT: movw %dx, 2(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm2, 2(%rdi) ; CHECK-NEXT: testb $4, %al ; CHECK-NEXT: je LBB13_6 ; CHECK-NEXT: LBB13_5: ## %cond.store3 -; CHECK-NEXT: movw %cx, 4(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm3, 4(%rdi) ; CHECK-NEXT: testb $8, %al ; CHECK-NEXT: je LBB13_8 ; CHECK-NEXT: LBB13_7: ## %cond.store5 -; CHECK-NEXT: movw %r8w, 6(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm4, 6(%rdi) ; CHECK-NEXT: testb $16, %al ; CHECK-NEXT: je LBB13_10 ; CHECK-NEXT: LBB13_9: ## %cond.store7 -; CHECK-NEXT: movw %r9w, 8(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm5, 8(%rdi) ; CHECK-NEXT: testb $32, %al ; CHECK-NEXT: je LBB13_12 ; CHECK-NEXT: LBB13_11: ## %cond.store9 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 10(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm6, 10(%rdi) ; CHECK-NEXT: testb $64, %al ; CHECK-NEXT: je LBB13_14 ; CHECK-NEXT: LBB13_13: ## %cond.store11 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 12(%rdi) +; CHECK-NEXT: vpextrw $0, %xmm7, 12(%rdi) ; CHECK-NEXT: testb $-128, %al ; CHECK-NEXT: je LBB13_16 ; CHECK-NEXT: LBB13_15: ## %cond.store13 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 14(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 14(%rdi) ; CHECK-NEXT: testl $256, %eax ## imm = 0x100 ; CHECK-NEXT: je LBB13_18 ; CHECK-NEXT: LBB13_17: ## %cond.store15 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 16(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 16(%rdi) ; CHECK-NEXT: testl $512, %eax ## imm = 0x200 ; CHECK-NEXT: je LBB13_20 ; CHECK-NEXT: LBB13_19: ## %cond.store17 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 18(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 18(%rdi) ; CHECK-NEXT: testl $1024, %eax ## imm = 0x400 ; CHECK-NEXT: je LBB13_22 ; CHECK-NEXT: LBB13_21: ## %cond.store19 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw 
%cx, 20(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 20(%rdi) ; CHECK-NEXT: testl $2048, %eax ## imm = 0x800 ; CHECK-NEXT: je LBB13_24 ; CHECK-NEXT: LBB13_23: ## %cond.store21 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 22(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 22(%rdi) ; CHECK-NEXT: testl $4096, %eax ## imm = 0x1000 ; CHECK-NEXT: je LBB13_26 ; CHECK-NEXT: LBB13_25: ## %cond.store23 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 24(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 24(%rdi) ; CHECK-NEXT: testl $8192, %eax ## imm = 0x2000 ; CHECK-NEXT: je LBB13_28 ; CHECK-NEXT: LBB13_27: ## %cond.store25 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 26(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 26(%rdi) ; CHECK-NEXT: testl $16384, %eax ## imm = 0x4000 ; CHECK-NEXT: je LBB13_30 ; CHECK-NEXT: LBB13_29: ## %cond.store27 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx -; CHECK-NEXT: movw %cx, 28(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 28(%rdi) ; CHECK-NEXT: testl $32768, %eax ## imm = 0x8000 ; CHECK-NEXT: je LBB13_32 ; CHECK-NEXT: LBB13_31: ## %cond.store29 -; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: movw %ax, 30(%rdi) +; CHECK-NEXT: pinsrw $0, {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: vpextrw $0, %xmm0, 30(%rdi) ; CHECK-NEXT: retq call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) ret void diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -29,7 +29,7 @@ ; LIBCALL-NEXT: .cfi_offset %rbx, -16 ; LIBCALL-NEXT: movq %rdi, %rbx ; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; LIBCALL-NEXT: movw %ax, (%rbx) +; LIBCALL-NEXT: pextrw $0, %xmm0, (%rbx) ; LIBCALL-NEXT: popq %rbx ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: retq @@ -37,7 +37,8 @@ ; F16C-LABEL: test1: ; F16C: # %bb.0: ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, (%rdi) +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movw %ax, (%rdi) ; F16C-NEXT: retq ; ; SOFTFLOAT-LABEL: test1: @@ -59,7 +60,7 @@ define float @test2(i16* nocapture %src) { ; LIBCALL-LABEL: test2: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: movzwl (%rdi), %edi +; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; F16C-LABEL: test2: @@ -89,7 +90,6 @@ ; LIBCALL-NEXT: pushq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 16 ; LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; LIBCALL-NEXT: movzwl %ax, %edi ; LIBCALL-NEXT: popq %rax ; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL @@ -97,6 +97,9 @@ ; F16C-LABEL: test3: ; F16C: # %bb.0: ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: retq ; @@ -118,22 +121,13 @@ define double @test4(i16* nocapture %src) { ; LIBCALL-LABEL: test4: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: pushq %rax -; LIBCALL-NEXT: .cfi_def_cfa_offset 16 -; LIBCALL-NEXT: movzwl (%rdi), %edi -; LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; LIBCALL-NEXT: popq %rax -; LIBCALL-NEXT: .cfi_def_cfa_offset 8 -; LIBCALL-NEXT: retq +; LIBCALL-NEXT: pinsrw $0, 
(%rdi), %xmm0 +; LIBCALL-NEXT: jmp __extendhfdf2@PLT # TAILCALL ; ; F16C-LABEL: test4: ; F16C: # %bb.0: -; F16C-NEXT: movzwl (%rdi), %eax -; F16C-NEXT: vmovd %eax, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; F16C-NEXT: retq +; F16C-NEXT: pinsrw $0, (%rdi), %xmm0 +; F16C-NEXT: jmp __extendhfdf2@PLT # TAILCALL ; ; SOFTFLOAT-LABEL: test4: ; SOFTFLOAT: # %bb.0: @@ -154,11 +148,25 @@ define i16 @test5(double %src) { ; LIBCALL-LABEL: test5: ; LIBCALL: # %bb.0: -; LIBCALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; LIBCALL-NEXT: pushq %rax +; LIBCALL-NEXT: .cfi_def_cfa_offset 16 +; LIBCALL-NEXT: callq __truncdfhf2@PLT +; LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax +; LIBCALL-NEXT: popq %rcx +; LIBCALL-NEXT: .cfi_def_cfa_offset 8 +; LIBCALL-NEXT: retq ; ; F16C-LABEL: test5: ; F16C: # %bb.0: -; F16C-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; F16C-NEXT: pushq %rax +; F16C-NEXT: .cfi_def_cfa_offset 16 +; F16C-NEXT: callq __truncdfhf2@PLT +; F16C-NEXT: pextrw $0, %xmm0, %eax +; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: popq %rcx +; F16C-NEXT: .cfi_def_cfa_offset 8 +; F16C-NEXT: retq ; ; SOFTFLOAT-LABEL: test5: ; SOFTFLOAT: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll --- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll +++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll @@ -1,50 +1,36 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL --check-prefix=F16C -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL define zeroext i16 @test1_fast(double %d) #0 { -; F16C-LABEL: test1_fast: -; F16C: # %bb.0: # %entry -; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-NEXT: retq -; -; AVX-LABEL: test1_fast: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq __truncdfhf2@PLT -; AVX-NEXT: popq %rcx -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; ALL-LABEL: test1_fast: +; ALL: # %bb.0: # %entry +; ALL-NEXT: pushq %rax +; ALL-NEXT: .cfi_def_cfa_offset 16 +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: pextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: popq %rcx +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d) ret i16 %0 } define zeroext i16 @test2_fast(x86_fp80 %d) #0 { -; F16C-LABEL: test2_fast: -; F16C: # %bb.0: # %entry -; F16C-NEXT: fldt {{[0-9]+}}(%rsp) -; F16C-NEXT: fstps -{{[0-9]+}}(%rsp) -; F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-NEXT: retq -; -; AVX-LABEL: test2_fast: -; AVX: # %bb.0: # %entry -; AVX-NEXT: subq $24, %rsp -; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fstpt (%rsp) -; AVX-NEXT: callq __truncxfhf2@PLT -; AVX-NEXT: addq $24, %rsp -; 
AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; ALL-LABEL: test2_fast: +; ALL: # %bb.0: # %entry +; ALL-NEXT: subq $24, %rsp +; ALL-NEXT: .cfi_def_cfa_offset 32 +; ALL-NEXT: fldt {{[0-9]+}}(%rsp) +; ALL-NEXT: fstpt (%rsp) +; ALL-NEXT: callq __truncxfhf2@PLT +; ALL-NEXT: pextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: addq $24, %rsp +; ALL-NEXT: .cfi_def_cfa_offset 8 +; ALL-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d) ret i16 %0 @@ -56,6 +42,8 @@ ; ALL-NEXT: pushq %rax ; ALL-NEXT: .cfi_def_cfa_offset 16 ; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: pextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax ; ALL-NEXT: popq %rcx ; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq @@ -72,6 +60,8 @@ ; ALL-NEXT: fldt {{[0-9]+}}(%rsp) ; ALL-NEXT: fstpt (%rsp) ; ALL-NEXT: callq __truncxfhf2@PLT +; ALL-NEXT: pextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax ; ALL-NEXT: addq $24, %rsp ; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -111,11 +111,9 @@ ; X64: # %bb.0: ; X64-NEXT: pushq %rax ; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: movzwl %ax, %edi ; X64-NEXT: popq %rax ; X64-NEXT: .cfi_def_cfa_offset 8 ; X64-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -2109,8 +2109,9 @@ ; X86-SSE-LABEL: test_signed_i1_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2125,7 +2126,6 @@ ; X64-LABEL: test_signed_i1_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2192,8 +2192,9 @@ ; X86-SSE-LABEL: test_signed_i8_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2208,7 +2209,6 @@ ; X64-LABEL: test_signed_i8_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2276,8 +2276,9 @@ ; X86-SSE-LABEL: test_signed_i13_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: 
calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2292,7 +2293,6 @@ ; X64-LABEL: test_signed_i13_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2360,8 +2360,9 @@ ; X86-SSE-LABEL: test_signed_i16_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2376,7 +2377,6 @@ ; X64-LABEL: test_signed_i16_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2444,8 +2444,9 @@ ; X86-SSE-LABEL: test_signed_i19_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2461,7 +2462,6 @@ ; X64-LABEL: test_signed_i19_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: ucomiss %xmm0, %xmm0 @@ -2529,8 +2529,9 @@ ; X86-SSE-LABEL: test_signed_i32_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2547,7 +2548,6 @@ ; X64-LABEL: test_signed_i32_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2634,8 +2634,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2669,7 +2670,6 @@ ; X64-LABEL: test_signed_i50_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2759,8 +2759,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; 
X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2794,7 +2795,6 @@ ; X64-LABEL: test_signed_i64_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2908,9 +2908,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $44, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -2963,7 +2964,6 @@ ; X64-LABEL: test_signed_i100_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixsfti@PLT @@ -3093,9 +3093,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $44, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -3144,7 +3145,6 @@ ; X64-LABEL: test_signed_i128_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixsfti@PLT diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -1925,11 +1925,12 @@ ; X86-SSE-LABEL: test_unsigned_i1_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: minss %xmm0, %xmm1 @@ -1941,7 +1942,6 @@ ; X64-LABEL: test_unsigned_i1_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2000,11 +2000,12 @@ ; X86-SSE-LABEL: test_unsigned_i8_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: minss %xmm0, %xmm1 @@ -2016,7 +2017,6 @@ ; X64-LABEL: test_unsigned_i8_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; 
X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2074,11 +2074,12 @@ ; X86-SSE-LABEL: test_unsigned_i13_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: minss %xmm0, %xmm1 @@ -2090,7 +2091,6 @@ ; X64-LABEL: test_unsigned_i13_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2148,11 +2148,12 @@ ; X86-SSE-LABEL: test_unsigned_i16_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) -; X86-SSE-NEXT: xorps %xmm0, %xmm0 +; X86-SSE-NEXT: pxor %xmm0, %xmm0 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-SSE-NEXT: minss %xmm0, %xmm1 @@ -2164,7 +2165,6 @@ ; X64-LABEL: test_unsigned_i16_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm0, %xmm1 @@ -2222,8 +2222,9 @@ ; X86-SSE-LABEL: test_unsigned_i19_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2237,7 +2238,6 @@ ; X64-LABEL: test_unsigned_i19_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm1, %xmm0 @@ -2293,8 +2293,9 @@ ; X86-SSE-LABEL: test_unsigned_i32_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $12, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2319,7 +2320,6 @@ ; X64-LABEL: test_unsigned_i32_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: xorl %ecx, %ecx @@ -2409,8 +2409,9 @@ ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $24, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero @@ -2455,7 +2456,6 @@ ; X64-LABEL: test_unsigned_i50_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: cvttss2si %xmm0, %rax ; X64-NEXT: xorl %ecx, %ecx @@ -2543,8 +2543,9 @@ ; X86-SSE-LABEL: test_unsigned_i64_f16: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $28, %esp -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -2587,7 +2588,6 @@ ; X64-LABEL: test_unsigned_i64_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movaps %xmm0, %xmm2 @@ -2693,9 +2693,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -2743,7 +2744,6 @@ ; X64-LABEL: test_unsigned_i100_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixunssfti@PLT @@ -2844,9 +2844,10 @@ ; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $32, %esp +; X86-SSE-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: pextrw $0, %xmm0, %eax +; X86-SSE-NEXT: movw %ax, (%esp) ; X86-SSE-NEXT: calll __gnu_h2f_ieee ; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movl %eax, (%esp) @@ -2892,7 +2893,6 @@ ; X64-LABEL: test_unsigned_i128_f16: ; X64: # %bb.0: ; X64-NEXT: pushq %rax -; X64-NEXT: movzwl %di, %edi ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; X64-NEXT: callq __fixunssfti@PLT diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll --- a/llvm/test/CodeGen/X86/freeze.ll +++ b/llvm/test/CodeGen/X86/freeze.ll @@ -38,14 +38,10 @@ ; X86ASM: # %bb.0: ; X86ASM-NEXT: pushq %rax ; X86ASM-NEXT: .cfi_def_cfa_offset 16 -; X86ASM-NEXT: xorl %edi, %edi -; X86ASM-NEXT: callq __gnu_h2f_ieee@PLT -; X86ASM-NEXT: callq __gnu_f2h_ieee@PLT -; X86ASM-NEXT: movzwl %ax, %edi ; X86ASM-NEXT: callq __gnu_h2f_ieee@PLT ; X86ASM-NEXT: addss %xmm0, %xmm0 ; X86ASM-NEXT: callq __gnu_f2h_ieee@PLT -; X86ASM-NEXT: popq %rcx +; X86ASM-NEXT: popq %rax ; X86ASM-NEXT: .cfi_def_cfa_offset 8 ; X86ASM-NEXT: retq %y1 = freeze half undef diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll --- a/llvm/test/CodeGen/X86/half-constrained.ll +++ b/llvm/test/CodeGen/X86/half-constrained.ll @@ -33,7 +33,7 @@ ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 ; X64-NOF16C-NEXT: 
popq %rax ; X64-NOF16C-NEXT: retq @@ -74,18 +74,18 @@ ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi -; X64-NOF16C-NEXT: callq ___extendhfsf2 -; X64-NOF16C-NEXT: cvtss2sd %xmm0, %xmm0 +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 +; X64-NOF16C-NEXT: callq ___extendhfdf2 ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq ; ; X64-F16C-LABEL: half_to_double: ; X64-F16C: ## %bb.0: -; X64-F16C-NEXT: movzwl _a(%rip), %eax -; X64-F16C-NEXT: vmovd %eax, %xmm0 -; X64-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; X64-F16C-NEXT: pushq %rax +; X64-F16C-NEXT: .cfi_def_cfa_offset 16 +; X64-F16C-NEXT: pinsrw $0, _a(%rip), %xmm0 +; X64-F16C-NEXT: callq ___extendhfdf2 +; X64-F16C-NEXT: popq %rax ; X64-F16C-NEXT: retq %1 = load half, half* @a, align 2 %2 = tail call double @llvm.experimental.constrained.fpext.f64.f16(half %1, metadata !"fpexcept.strict") #0 @@ -117,22 +117,18 @@ ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi -; X64-NOF16C-NEXT: callq ___extendhfsf2 -; X64-NOF16C-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) -; X64-NOF16C-NEXT: flds {{[0-9]+}}(%rsp) -; X64-NOF16C-NEXT: wait +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 +; X64-NOF16C-NEXT: callq ___extendhfxf2 ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq ; ; X64-F16C-LABEL: half_to_fp80: ; X64-F16C: ## %bb.0: -; X64-F16C-NEXT: movzwl _a(%rip), %eax -; X64-F16C-NEXT: vmovd %eax, %xmm0 -; X64-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; X64-F16C-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) -; X64-F16C-NEXT: flds -{{[0-9]+}}(%rsp) -; X64-F16C-NEXT: wait +; X64-F16C-NEXT: pushq %rax +; X64-F16C-NEXT: .cfi_def_cfa_offset 16 +; X64-F16C-NEXT: pinsrw $0, _a(%rip), %xmm0 +; X64-F16C-NEXT: callq ___extendhfxf2 +; X64-F16C-NEXT: popq %rax ; X64-F16C-NEXT: retq %1 = load half, half* @a, align 2 %2 = tail call x86_fp80 @llvm.experimental.constrained.fpext.f80.f16(half %1, metadata !"fpexcept.strict") #0 @@ -169,6 +165,7 @@ ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X64-NOF16C-NEXT: callq ___truncsfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _a(%rip) ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq @@ -178,7 +175,8 @@ ; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip) +; X64-F16C-NEXT: vmovd %xmm0, %eax +; X64-F16C-NEXT: movw %ax, _a(%rip) ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 store half %2, half* @a, align 2 @@ -215,6 +213,7 @@ ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X64-NOF16C-NEXT: callq ___truncdfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _a(%rip) ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq @@ -224,7 +223,7 @@ ; X64-F16C-NEXT: pushq %rax ; X64-F16C-NEXT: .cfi_def_cfa_offset 16 ; X64-F16C-NEXT: callq ___truncdfhf2 -; X64-F16C-NEXT: movw %ax, _a(%rip) +; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip) ; X64-F16C-NEXT: popq %rax ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 @@ -265,6 +264,7 @@ ; X64-NOF16C-NEXT: fstpt (%rsp) ; 
X64-NOF16C-NEXT: wait ; X64-NOF16C-NEXT: callq ___truncxfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _a(%rip) ; X64-NOF16C-NEXT: addq $24, %rsp ; X64-NOF16C-NEXT: retq @@ -277,7 +277,7 @@ ; X64-F16C-NEXT: fstpt (%rsp) ; X64-F16C-NEXT: wait ; X64-F16C-NEXT: callq ___truncxfhf2 -; X64-F16C-NEXT: movw %ax, _a(%rip) +; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip) ; X64-F16C-NEXT: addq $24, %rsp ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f80(x86_fp80 %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 @@ -332,13 +332,14 @@ ; X64-NOF16C: ## %bb.0: ; X64-NOF16C-NEXT: pushq %rax ; X64-NOF16C-NEXT: .cfi_def_cfa_offset 16 -; X64-NOF16C-NEXT: movzwl _a(%rip), %edi +; X64-NOF16C-NEXT: pinsrw $0, _a(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 -; X64-NOF16C-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill -; X64-NOF16C-NEXT: movzwl _b(%rip), %edi +; X64-NOF16C-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill +; X64-NOF16C-NEXT: pinsrw $0, _b(%rip), %xmm0 ; X64-NOF16C-NEXT: callq ___extendhfsf2 ; X64-NOF16C-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Folded Reload ; X64-NOF16C-NEXT: callq ___truncsfhf2 +; X64-NOF16C-NEXT: pextrw $0, %xmm0, %eax ; X64-NOF16C-NEXT: movw %ax, _c(%rip) ; X64-NOF16C-NEXT: popq %rax ; X64-NOF16C-NEXT: retq @@ -355,7 +356,8 @@ ; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-F16C-NEXT: vpextrw $0, %xmm0, _c(%rip) +; X64-F16C-NEXT: vmovd %xmm0, %eax +; X64-F16C-NEXT: movw %ax, _c(%rip) ; X64-F16C-NEXT: retq %1 = load half, half* @a, align 2 %2 = tail call float @llvm.experimental.constrained.fpext.f32.f16(half %1, metadata !"fpexcept.strict") #0 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=1 \ -; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON,BWON-NOF16C +; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWON ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-f16c -fixup-byte-word-insts=0 \ ; RUN: | FileCheck %s -check-prefixes=CHECK,CHECK-LIBCALL,BWOFF ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+f16c -fixup-byte-word-insts=1 \ @@ -9,23 +9,25 @@ ; RUN: | FileCheck %s -check-prefixes=CHECK-I686 define void @test_load_store(half* %in, half* %out) #0 { -; BWON-LABEL: test_load_store: -; BWON: # %bb.0: -; BWON-NEXT: movzwl (%rdi), %eax -; BWON-NEXT: movw %ax, (%rsi) -; BWON-NEXT: retq +; CHECK-LIBCALL-LABEL: test_load_store: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rsi) +; CHECK-LIBCALL-NEXT: retq ; -; BWOFF-LABEL: test_load_store: -; BWOFF: # %bb.0: -; BWOFF-NEXT: movw (%rdi), %ax -; BWOFF-NEXT: movw %ax, (%rsi) -; BWOFF-NEXT: retq +; BWON-F16C-LABEL: test_load_store: +; BWON-F16C: # %bb.0: +; BWON-F16C-NEXT: pinsrw $0, (%rdi), %xmm0 +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) +; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_load_store: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-I686-NEXT: movw (%ecx), %cx +; CHECK-I686-NEXT: 
pinsrw $0, (%ecx), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %ecx ; CHECK-I686-NEXT: movw %cx, (%eax) ; CHECK-I686-NEXT: retl %val = load half, half* %in @@ -74,7 +76,7 @@ define float @test_extend32(half* %addr) #0 { ; CHECK-LIBCALL-LABEL: test_extend32: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; BWON-F16C-LABEL: test_extend32: @@ -88,8 +90,9 @@ ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl @@ -99,30 +102,19 @@ } define double @test_extend64(half* %addr) #0 { -; CHECK-LIBCALL-LABEL: test_extend64: -; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: popq %rax -; CHECK-LIBCALL-NEXT: retq -; -; BWON-F16C-LABEL: test_extend64: -; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: retq +; CHECK-LABEL: test_extend64: +; CHECK: # %bb.0: +; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-NEXT: jmp __extendhfdf2@PLT # TAILCALL ; ; CHECK-I686-LABEL: test_extend64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: calll __extendhfdf2 ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %val16 = load half, half* %addr @@ -136,6 +128,7 @@ ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rdi, %rbx ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: retq @@ -143,7 +136,8 @@ ; BWON-F16C-LABEL: test_trunc32: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rdi) +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movw %ax, (%rdi) ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc32: @@ -151,9 +145,10 @@ ; CHECK-I686-NEXT: pushl %esi ; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movd %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) ; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi @@ -164,23 +159,34 @@ } define void @test_trunc64(double %in, half* %addr) #0 { -; CHECK-LABEL: test_trunc64: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: callq __truncdfhf2@PLT -; CHECK-NEXT: movw %ax, (%rbx) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: 
retq +; CHECK-LIBCALL-LABEL: test_trunc64: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq +; +; BWON-F16C-LABEL: test_trunc64: +; BWON-F16C: # %bb.0: +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: movq %rdi, %rbx +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: popq %rbx +; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: pushl %esi ; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movsd %xmm0, (%esp) +; CHECK-I686-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: movq %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) ; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi @@ -191,32 +197,22 @@ } define i64 @test_fptosi_i64(half* %p) #0 { -; CHECK-LIBCALL-LABEL: test_fptosi_i64: -; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax -; CHECK-LIBCALL-NEXT: popq %rcx -; CHECK-LIBCALL-NEXT: retq -; -; BWON-F16C-LABEL: test_fptosi_i64: -; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax -; BWON-F16C-NEXT: retq +; CHECK-LABEL: test_fptosi_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq ; ; CHECK-I686-LABEL: test_fptosi_i64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: fstps (%esp) -; CHECK-I686-NEXT: calll __fixsfdi +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: calll __fixhfdi ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %a = load half, half* %p, align 2 @@ -229,33 +225,34 @@ ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rsi, %rbx -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: callq __floatdihf@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_sitofp_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: movq %rsi, %rbx +; BWON-F16C-NEXT: callq __floatdihf@PLT +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_sitofp_i64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $24, %esp +; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; 
CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss %xmm0, (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: subl $8, %esp +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: calll __floatdihf +; CHECK-I686-NEXT: addl $16, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) -; CHECK-I686-NEXT: addl $24, %esp +; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl %r = sitofp i64 %a to half @@ -264,47 +261,22 @@ } define i64 @test_fptoui_i64(half* %p) #0 { -; CHECK-LIBCALL-LABEL: test_fptoui_i64: -; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm2 -; CHECK-LIBCALL-NEXT: subss %xmm1, %xmm2 -; CHECK-LIBCALL-NEXT: cvttss2si %xmm2, %rax -; CHECK-LIBCALL-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; CHECK-LIBCALL-NEXT: xorq %rax, %rcx -; CHECK-LIBCALL-NEXT: cvttss2si %xmm0, %rax -; CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: cmovaeq %rcx, %rax -; CHECK-LIBCALL-NEXT: popq %rcx -; CHECK-LIBCALL-NEXT: retq -; -; BWON-F16C-LABEL: test_fptoui_i64: -; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; BWON-F16C-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; BWON-F16C-NEXT: vcvttss2si %xmm2, %rax -; BWON-F16C-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; BWON-F16C-NEXT: xorq %rax, %rcx -; BWON-F16C-NEXT: vcvttss2si %xmm0, %rax -; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 -; BWON-F16C-NEXT: cmovaeq %rcx, %rax -; BWON-F16C-NEXT: retq +; CHECK-LABEL: test_fptoui_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-NEXT: callq __fixhfdi@PLT +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: retq ; ; CHECK-I686-LABEL: test_fptoui_i64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: fstps (%esp) -; CHECK-I686-NEXT: calll __fixunssfdi +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: calll __fixunshfdi ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %a = load half, half* %p, align 2 @@ -317,58 +289,34 @@ ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rbx ; CHECK-LIBCALL-NEXT: movq %rsi, %rbx -; CHECK-LIBCALL-NEXT: testq %rdi, %rdi -; CHECK-LIBCALL-NEXT: js .LBB10_1 -; CHECK-LIBCALL-NEXT: # %bb.2: -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: jmp .LBB10_3 -; CHECK-LIBCALL-NEXT: .LBB10_1: -; CHECK-LIBCALL-NEXT: movq %rdi, %rax -; CHECK-LIBCALL-NEXT: shrq %rax -; CHECK-LIBCALL-NEXT: andl $1, %edi -; CHECK-LIBCALL-NEXT: orq %rax, %rdi -; CHECK-LIBCALL-NEXT: cvtsi2ss %rdi, %xmm0 -; CHECK-LIBCALL-NEXT: addss %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: .LBB10_3: -; CHECK-LIBCALL-NEXT: callq 
__gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: callq __floatundihf@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) ; CHECK-LIBCALL-NEXT: popq %rbx ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_uitofp_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: testq %rdi, %rdi -; BWON-F16C-NEXT: js .LBB10_1 -; BWON-F16C-NEXT: # %bb.2: -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; BWON-F16C-NEXT: jmp .LBB10_3 -; BWON-F16C-NEXT: .LBB10_1: -; BWON-F16C-NEXT: movq %rdi, %rax -; BWON-F16C-NEXT: shrq %rax -; BWON-F16C-NEXT: andl $1, %edi -; BWON-F16C-NEXT: orq %rax, %rdi -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: .LBB10_3: -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: movq %rsi, %rbx +; BWON-F16C-NEXT: callq __floatundihf@PLT +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_uitofp_i64: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $24, %esp +; CHECK-I686-NEXT: subl $8, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-I686-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: shrl $31, %eax -; CHECK-I686-NEXT: fildll {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4) -; CHECK-I686-NEXT: fstps (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee +; CHECK-I686-NEXT: subl $8, %esp +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: calll __floatundihf +; CHECK-I686-NEXT: addl $16, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax ; CHECK-I686-NEXT: movw %ax, (%esi) -; CHECK-I686-NEXT: addl $24, %esp +; CHECK-I686-NEXT: addl $8, %esp ; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl %r = uitofp i64 %a to half @@ -379,36 +327,31 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-LABEL: test_extend32_vec4: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: subq $88, %rsp -; CHECK-LIBCALL-NEXT: movl (%rdi), %eax -; CHECK-LIBCALL-NEXT: movl 4(%rdi), %ecx -; CHECK-LIBCALL-NEXT: movl %eax, (%rsp) -; CHECK-LIBCALL-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 -; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; CHECK-LIBCALL-NEXT: subq $72, %rsp +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 2(%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi +; CHECK-LIBCALL-NEXT: pinsrw $0, 4(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 6(%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $1, %xmm0, %edi +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %edi +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-LIBCALL-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] -; CHECK-LIBCALL-NEXT: addq $88, %rsp +; CHECK-LIBCALL-NEXT: addq $72, %rsp ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_extend32_vec4: @@ -418,32 +361,30 @@ ; ; CHECK-I686-LABEL: test_extend32_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: subl $124, %esp +; CHECK-I686-NEXT: pushl %esi +; CHECK-I686-NEXT: subl $88, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movl (%eax), %ecx -; CHECK-I686-NEXT: movl 4(%eax), %eax -; CHECK-I686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 -; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pinsrw $0, 6(%eax), %xmm0 ; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, 4(%eax), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, 2(%eax), %xmm1 +; CHECK-I686-NEXT: pextrw $0, %xmm1, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $1, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: movw %si, (%esp) 
; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) @@ -458,7 +399,8 @@ ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-I686-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-I686-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; CHECK-I686-NEXT: addl $124, %esp +; CHECK-I686-NEXT: addl $88, %esp +; CHECK-I686-NEXT: popl %esi ; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x float> @@ -468,37 +410,31 @@ define <4 x double> @test_extend64_vec4(<4 x half>* %p) #0 { ; CHECK-LIBCALL-LABEL: test_extend64_vec4: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rbp -; CHECK-LIBCALL-NEXT: pushq %r14 -; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $32, %rsp -; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %r14d -; CHECK-LIBCALL-NEXT: movzwl 6(%rdi), %ebp -; CHECK-LIBCALL-NEXT: movzwl (%rdi), %ebx -; CHECK-LIBCALL-NEXT: movzwl 2(%rdi), %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movl %ebx, %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-LIBCALL-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-LIBCALL-NEXT: subq $72, %rsp +; CHECK-LIBCALL-NEXT: pinsrw $0, 4(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 6(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, 2(%rdi), %xmm0 +; CHECK-LIBCALL-NEXT: callq __extendhfdf2@PLT +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __extendhfdf2@PLT +; CHECK-LIBCALL-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movl %ebp, %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __extendhfdf2@PLT ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-LIBCALL-NEXT: movl %r14d, %edi -; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: cvtss2sd %xmm0, %xmm1 +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __extendhfdf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, %xmm1 ; CHECK-LIBCALL-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-LIBCALL-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-LIBCALL-NEXT: addq $32, %rsp -; CHECK-LIBCALL-NEXT: popq %rbx -; CHECK-LIBCALL-NEXT: popq %r14 -; CHECK-LIBCALL-NEXT: popq %rbp +; CHECK-LIBCALL-NEXT: addq $72, %rsp ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_extend64_vec4: @@ -509,39 +445,43 @@ ; ; CHECK-I686-LABEL: test_extend64_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %ebx -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; 
CHECK-I686-NEXT: subl $64, %esp +; CHECK-I686-NEXT: subl $104, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl 6(%eax), %esi -; CHECK-I686-NEXT: movzwl (%eax), %edi -; CHECK-I686-NEXT: movzwl 2(%eax), %ebx -; CHECK-I686-NEXT: movzwl 4(%eax), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: pinsrw $0, 6(%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: pinsrw $0, 2(%eax), %xmm0 +; CHECK-I686-NEXT: pinsrw $0, 4(%eax), %xmm1 +; CHECK-I686-NEXT: pextrw $0, %xmm1, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi +; CHECK-I686-NEXT: calll __extendhfdf2 ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movl %ebx, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: movw %si, (%esp) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi +; CHECK-I686-NEXT: calll __extendhfdf2 ; CHECK-I686-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-I686-NEXT: movl %edi, (%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movl %esi, (%esp) +; CHECK-I686-NEXT: movw %si, (%esp) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %esi +; CHECK-I686-NEXT: calll __extendhfdf2 +; CHECK-I686-NEXT: movw %si, (%esp) ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: calll __gnu_h2f_ieee +; CHECK-I686-NEXT: calll __extendhfdf2 ; CHECK-I686-NEXT: fstpl {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-I686-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; CHECK-I686-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-I686-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1] -; CHECK-I686-NEXT: addl $64, %esp +; CHECK-I686-NEXT: addl $104, %esp ; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi -; CHECK-I686-NEXT: popl %ebx ; CHECK-I686-NEXT: retl %a = load <4 x half>, <4 x half>* %p, align 8 %b = fpext <4 x half> %a to <4 x double> @@ -549,71 +489,39 @@ } define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 { -; BWON-NOF16C-LABEL: test_trunc32_vec4: -; BWON-NOF16C: # %bb.0: -; BWON-NOF16C-NEXT: pushq %rbp -; BWON-NOF16C-NEXT: pushq %r15 -; BWON-NOF16C-NEXT: pushq %r14 -; BWON-NOF16C-NEXT: pushq %rbx -; BWON-NOF16C-NEXT: subq $24, %rsp -; BWON-NOF16C-NEXT: movq %rdi, %rbx -; BWON-NOF16C-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movl %eax, %r14d -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movl %eax, %r15d -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; BWON-NOF16C-NEXT: callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movl %eax, %ebp -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: 
callq __gnu_f2h_ieee@PLT -; BWON-NOF16C-NEXT: movw %ax, (%rbx) -; BWON-NOF16C-NEXT: movw %bp, 6(%rbx) -; BWON-NOF16C-NEXT: movw %r15w, 4(%rbx) -; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx) -; BWON-NOF16C-NEXT: addq $24, %rsp -; BWON-NOF16C-NEXT: popq %rbx -; BWON-NOF16C-NEXT: popq %r14 -; BWON-NOF16C-NEXT: popq %r15 -; BWON-NOF16C-NEXT: popq %rbp -; BWON-NOF16C-NEXT: retq -; -; BWOFF-LABEL: test_trunc32_vec4: -; BWOFF: # %bb.0: -; BWOFF-NEXT: pushq %rbp -; BWOFF-NEXT: pushq %r15 -; BWOFF-NEXT: pushq %r14 -; BWOFF-NEXT: pushq %rbx -; BWOFF-NEXT: subq $24, %rsp -; BWOFF-NEXT: movq %rdi, %rbx -; BWOFF-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, %r14w -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, %r15w -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, %bp -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: callq __gnu_f2h_ieee@PLT -; BWOFF-NEXT: movw %ax, (%rbx) -; BWOFF-NEXT: movw %bp, 6(%rbx) -; BWOFF-NEXT: movw %r15w, 4(%rbx) -; BWOFF-NEXT: movw %r14w, 2(%rbx) -; BWOFF-NEXT: addq $24, %rsp -; BWOFF-NEXT: popq %rbx -; BWOFF-NEXT: popq %r14 -; BWOFF-NEXT: popq %r15 -; BWOFF-NEXT: popq %rbp -; BWOFF-NEXT: retq +; CHECK-LIBCALL-LABEL: test_trunc32_vec4: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $64, %rsp +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 6(%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 2(%rbx) +; CHECK-LIBCALL-NEXT: addq $64, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_trunc32_vec4: ; BWON-F16C: # %bb.0: @@ -622,40 +530,41 @@ ; ; CHECK-I686-LABEL: test_trunc32_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %ebp -; CHECK-I686-NEXT: pushl %ebx -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $44, %esp +; CHECK-I686-NEXT: subl $88, %esp ; 
CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-I686-NEXT: movaps %xmm0, %xmm1 ; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] ; CHECK-I686-NEXT: movss %xmm1, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %di +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %bx -; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: movd %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, (%ebp) -; CHECK-I686-NEXT: movw %bx, 6(%ebp) -; CHECK-I686-NEXT: movw %di, 4(%ebp) -; CHECK-I686-NEXT: movw %si, 2(%ebp) -; CHECK-I686-NEXT: addl $44, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 6(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 4(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 2(%esi) +; CHECK-I686-NEXT: addl $88, %esp ; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi -; CHECK-I686-NEXT: popl %ebx -; CHECK-I686-NEXT: popl %ebp ; CHECK-I686-NEXT: retl %v = fptrunc <4 x float> %a to <4 x half> store <4 x half> %v, <4 x half>* %p @@ -663,143 +572,109 @@ } define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 { -; BWON-NOF16C-LABEL: test_trunc64_vec4: -; BWON-NOF16C: # %bb.0: -; BWON-NOF16C-NEXT: pushq %rbp -; BWON-NOF16C-NEXT: pushq %r15 -; BWON-NOF16C-NEXT: pushq %r14 -; BWON-NOF16C-NEXT: pushq %rbx -; BWON-NOF16C-NEXT: subq $40, %rsp -; BWON-NOF16C-NEXT: movq %rdi, %rbx -; BWON-NOF16C-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; BWON-NOF16C-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movl %eax, %r14d -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movl %eax, %r15d -; BWON-NOF16C-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movl %eax, %ebp -; BWON-NOF16C-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWON-NOF16C-NEXT: callq __truncdfhf2@PLT -; BWON-NOF16C-NEXT: movw %ax, 4(%rbx) -; BWON-NOF16C-NEXT: movw %bp, (%rbx) -; BWON-NOF16C-NEXT: movw 
%r15w, 6(%rbx) -; BWON-NOF16C-NEXT: movw %r14w, 2(%rbx) -; BWON-NOF16C-NEXT: addq $40, %rsp -; BWON-NOF16C-NEXT: popq %rbx -; BWON-NOF16C-NEXT: popq %r14 -; BWON-NOF16C-NEXT: popq %r15 -; BWON-NOF16C-NEXT: popq %rbp -; BWON-NOF16C-NEXT: retq -; -; BWOFF-LABEL: test_trunc64_vec4: -; BWOFF: # %bb.0: -; BWOFF-NEXT: pushq %rbp -; BWOFF-NEXT: pushq %r15 -; BWOFF-NEXT: pushq %r14 -; BWOFF-NEXT: pushq %rbx -; BWOFF-NEXT: subq $40, %rsp -; BWOFF-NEXT: movq %rdi, %rbx -; BWOFF-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill -; BWOFF-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, %r14w -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, %r15w -; BWOFF-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, %bp -; BWOFF-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; BWOFF-NEXT: callq __truncdfhf2@PLT -; BWOFF-NEXT: movw %ax, 4(%rbx) -; BWOFF-NEXT: movw %bp, (%rbx) -; BWOFF-NEXT: movw %r15w, 6(%rbx) -; BWOFF-NEXT: movw %r14w, 2(%rbx) -; BWOFF-NEXT: addq $40, %rsp -; BWOFF-NEXT: popq %rbx -; BWOFF-NEXT: popq %r14 -; BWOFF-NEXT: popq %r15 -; BWOFF-NEXT: popq %rbp -; BWOFF-NEXT: retq +; CHECK-LIBCALL-LABEL: test_trunc64_vec4: +; CHECK-LIBCALL: # %bb.0: +; CHECK-LIBCALL-NEXT: pushq %rbx +; CHECK-LIBCALL-NEXT: subq $64, %rsp +; CHECK-LIBCALL-NEXT: movq %rdi, %rbx +; CHECK-LIBCALL-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 4(%rbx) +; CHECK-LIBCALL-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, (%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 6(%rbx) +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, 2(%rbx) +; CHECK-LIBCALL-NEXT: addq $64, %rsp +; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_trunc64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: pushq %rbp -; BWON-F16C-NEXT: pushq %r15 -; BWON-F16C-NEXT: pushq %r14 ; BWON-F16C-NEXT: pushq %rbx -; BWON-F16C-NEXT: subq $56, %rsp +; BWON-F16C-NEXT: subq $80, %rsp ; BWON-F16C-NEXT: movq %rdi, %rbx -; BWON-F16C-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; BWON-F16C-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; BWON-F16C-NEXT: vpermilpd 
{{.*#+}} xmm0 = xmm0[1,0] ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movl %eax, %r14d -; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; BWON-F16C-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BWON-F16C-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 -; BWON-F16C-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; BWON-F16C-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movl %eax, %r15d -; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; BWON-F16C-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BWON-F16C-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; BWON-F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; BWON-F16C-NEXT: vzeroupper ; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movl %eax, %ebp -; BWON-F16C-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BWON-F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; BWON-F16C-NEXT: callq __truncdfhf2@PLT -; BWON-F16C-NEXT: movw %ax, 4(%rbx) -; BWON-F16C-NEXT: movw %bp, (%rbx) -; BWON-F16C-NEXT: movw %r15w, 6(%rbx) -; BWON-F16C-NEXT: movw %r14w, 2(%rbx) -; BWON-F16C-NEXT: addq $56, %rsp +; BWON-F16C-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; BWON-F16C-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; BWON-F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BWON-F16C-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; BWON-F16C-NEXT: addq $80, %rsp ; BWON-F16C-NEXT: popq %rbx -; BWON-F16C-NEXT: popq %r14 -; BWON-F16C-NEXT: popq %r15 -; BWON-F16C-NEXT: popq %rbp ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc64_vec4: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %ebp -; CHECK-I686-NEXT: pushl %ebx -; CHECK-I686-NEXT: pushl %edi ; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $60, %esp +; CHECK-I686-NEXT: subl $88, %esp ; CHECK-I686-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp +; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-I686-NEXT: movlps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, %si +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, %di +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movlps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, %bx +; CHECK-I686-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-I686-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-I686-NEXT: movhps %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncdfhf2 -; CHECK-I686-NEXT: movw %ax, 6(%ebp) -; CHECK-I686-NEXT: movw %bx, 4(%ebp) -; CHECK-I686-NEXT: movw %di, 2(%ebp) -; CHECK-I686-NEXT: movw %si, (%ebp) 
-; CHECK-I686-NEXT: addl $60, %esp +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 6(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 4(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, 2(%esi) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esi) +; CHECK-I686-NEXT: addl $88, %esp ; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi -; CHECK-I686-NEXT: popl %ebx -; CHECK-I686-NEXT: popl %ebp ; CHECK-I686-NEXT: retl %v = fptrunc <4 x double> %a to <4 x half> store <4 x half> %v, <4 x half>* %p @@ -817,7 +692,7 @@ ; CHECK-LIBCALL-NEXT: pushq %rax ; CHECK-LIBCALL-NEXT: callq test_floatret@PLT ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; CHECK-LIBCALL-NEXT: popq %rcx +; CHECK-LIBCALL-NEXT: popq %rax ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: test_f80trunc_nodagcombine: @@ -826,8 +701,8 @@ ; BWON-F16C-NEXT: callq test_floatret@PLT ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax -; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax -; BWON-F16C-NEXT: popq %rcx +; BWON-F16C-NEXT: pinsrw $0, %eax, %xmm0 +; BWON-F16C-NEXT: popq %rax ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_f80trunc_nodagcombine: @@ -849,51 +724,76 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 { ; CHECK-LIBCALL-LABEL: test_sitofp_fadd_i32: ; CHECK-LIBCALL: # %bb.0: -; CHECK-LIBCALL-NEXT: pushq %rbx -; CHECK-LIBCALL-NEXT: subq $16, %rsp -; CHECK-LIBCALL-NEXT: movzwl (%rsi), %ebx -; CHECK-LIBCALL-NEXT: cvtsi2ss %edi, %xmm0 -; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi +; CHECK-LIBCALL-NEXT: subq $40, %rsp +; CHECK-LIBCALL-NEXT: pinsrw $0, (%rsi), %xmm0 +; CHECK-LIBCALL-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-LIBCALL-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; CHECK-LIBCALL-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; CHECK-LIBCALL-NEXT: movl $1127219200, {{[0-9]+}}(%rsp) # imm = 0x43300000 +; CHECK-LIBCALL-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-LIBCALL-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-LIBCALL-NEXT: callq __truncdfhf2@PLT +; CHECK-LIBCALL-NEXT: movd %xmm0, %eax +; CHECK-LIBCALL-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-LIBCALL-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-LIBCALL-NEXT: movl %ebx, %edi +; CHECK-LIBCALL-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; CHECK-LIBCALL-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT ; CHECK-LIBCALL-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT -; CHECK-LIBCALL-NEXT: movzwl %ax, %edi -; CHECK-LIBCALL-NEXT: addq $16, %rsp -; CHECK-LIBCALL-NEXT: popq %rbx +; CHECK-LIBCALL-NEXT: addq $40, %rsp ; CHECK-LIBCALL-NEXT: jmp __gnu_h2f_ieee@PLT # TAILCALL ; ; BWON-F16C-LABEL: test_sitofp_fadd_i32: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rsi), %eax -; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: 
pushq %rbx +; BWON-F16C-NEXT: subq $16, %rsp +; BWON-F16C-NEXT: movzwl (%rsi), %ebx +; BWON-F16C-NEXT: xorl $-2147483648, %edi # imm = 0x80000000 +; BWON-F16C-NEXT: movl %edi, {{[0-9]+}}(%rsp) +; BWON-F16C-NEXT: movl $1127219200, {{[0-9]+}}(%rsp) # imm = 0x43300000 +; BWON-F16C-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; BWON-F16C-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: pextrw $0, %xmm0, %eax +; BWON-F16C-NEXT: vmovd %ebx, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: movzwl %ax, %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BWON-F16C-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: movzwl %ax, %eax +; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; BWON-F16C-NEXT: addq $16, %rsp +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_sitofp_fadd_i32: ; CHECK-I686: # %bb.0: -; CHECK-I686-NEXT: pushl %edi -; CHECK-I686-NEXT: pushl %esi -; CHECK-I686-NEXT: subl $20, %esp +; CHECK-I686-NEXT: subl $76, %esp ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movzwl (%eax), %edi -; CHECK-I686-NEXT: cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 -; CHECK-I686-NEXT: movss %xmm0, (%esp) -; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movw %ax, %si -; CHECK-I686-NEXT: movl %edi, (%esp) +; CHECK-I686-NEXT: pinsrw $0, (%eax), %xmm0 +; CHECK-I686-NEXT: movdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movl $-2147483648, %eax # imm = 0x80000000 +; CHECK-I686-NEXT: xorl {{[0-9]+}}(%esp), %eax +; CHECK-I686-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-I686-NEXT: movl $1127219200, {{[0-9]+}}(%esp) # imm = 0x43300000 +; CHECK-I686-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-I686-NEXT: subsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; CHECK-I686-NEXT: movsd %xmm0, (%esp) +; CHECK-I686-NEXT: calll __truncdfhf2 +; CHECK-I686-NEXT: movapd %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: movzwl %si, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) @@ -901,12 +801,10 @@ ; CHECK-I686-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __gnu_f2h_ieee -; CHECK-I686-NEXT: movzwl %ax, %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee -; CHECK-I686-NEXT: addl $20, %esp -; CHECK-I686-NEXT: popl %esi -; CHECK-I686-NEXT: popl %edi +; CHECK-I686-NEXT: addl $76, %esp ; CHECK-I686-NEXT: retl %tmp0 = load half, half* %b %tmp1 = sitofp i32 %a to half @@ -919,47 +817,60 @@ ; CHECK-LIBCALL-LABEL: PR40273: ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: pushq %rax -; CHECK-LIBCALL-NEXT: movzwl %di, %edi ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee@PLT -; CHECK-LIBCALL-NEXT: xorl %eax, %eax ; CHECK-LIBCALL-NEXT: xorps %xmm1, %xmm1 ; 
CHECK-LIBCALL-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: movl $15360, %ecx # imm = 0x3C00 -; CHECK-LIBCALL-NEXT: cmovnel %ecx, %eax -; CHECK-LIBCALL-NEXT: cmovpl %ecx, %eax -; CHECK-LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-LIBCALL-NEXT: popq %rcx +; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: jne .LBB17_3 +; CHECK-LIBCALL-NEXT: # %bb.1: +; CHECK-LIBCALL-NEXT: jp .LBB17_3 +; CHECK-LIBCALL-NEXT: # %bb.2: +; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 +; CHECK-LIBCALL-NEXT: .LBB17_3: +; CHECK-LIBCALL-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-LIBCALL-NEXT: popq %rax ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: PR40273: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl %di, %eax +; BWON-F16C-NEXT: pextrw $0, %xmm0, %eax +; BWON-F16C-NEXT: movzwl %ax, %eax ; BWON-F16C-NEXT: vmovd %eax, %xmm0 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: xorl %eax, %eax ; BWON-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm0 -; BWON-F16C-NEXT: movl $15360, %ecx # imm = 0x3C00 -; BWON-F16C-NEXT: cmovnel %ecx, %eax -; BWON-F16C-NEXT: cmovpl %ecx, %eax -; BWON-F16C-NEXT: # kill: def $ax killed $ax killed $eax +; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; BWON-F16C-NEXT: jne .LBB17_3 +; BWON-F16C-NEXT: # %bb.1: +; BWON-F16C-NEXT: jp .LBB17_3 +; BWON-F16C-NEXT: # %bb.2: +; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; BWON-F16C-NEXT: .LBB17_3: +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BWON-F16C-NEXT: vmovd %xmm0, %eax +; BWON-F16C-NEXT: pinsrw $0, %eax, %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: PR40273: ; CHECK-I686: # %bb.0: ; CHECK-I686-NEXT: subl $12, %esp -; CHECK-I686-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-I686-NEXT: movl %eax, (%esp) +; CHECK-I686-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax +; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __gnu_h2f_ieee ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: xorl %eax, %eax ; CHECK-I686-NEXT: xorps %xmm1, %xmm1 ; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-I686-NEXT: movl $15360, %ecx # imm = 0x3C00 -; CHECK-I686-NEXT: cmovnel %ecx, %eax -; CHECK-I686-NEXT: cmovpl %ecx, %eax -; CHECK-I686-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: jne .LBB17_3 +; CHECK-I686-NEXT: # %bb.1: +; CHECK-I686-NEXT: jp .LBB17_3 +; CHECK-I686-NEXT: # %bb.2: +; CHECK-I686-NEXT: xorps %xmm0, %xmm0 +; CHECK-I686-NEXT: .LBB17_3: +; CHECK-I686-NEXT: movss %xmm0, (%esp) +; CHECK-I686-NEXT: calll __gnu_f2h_ieee ; CHECK-I686-NEXT: addl $12, %esp ; CHECK-I686-NEXT: retl %2 = fcmp une half %0, 0xH0000 diff --git a/llvm/test/CodeGen/X86/pr31088.ll b/llvm/test/CodeGen/X86/pr31088.ll --- a/llvm/test/CodeGen/X86/pr31088.ll +++ b/llvm/test/CodeGen/X86/pr31088.ll @@ -7,13 +7,16 @@ define <1 x half> @ir_fadd_v1f16(<1 x half> %arg0, <1 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v1f16: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: subl $12, %esp -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: subl $28, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll 
__gnu_h2f_ieee -; X86-NEXT: movl %esi, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstps {{[0-9]+}}(%esp) @@ -21,54 +24,60 @@ ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movss %xmm0, (%esp) ; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: addl $12, %esp -; X86-NEXT: popl %esi +; X86-NEXT: addl $28, %esp ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v1f16: ; X64: # %bb.0: -; X64-NEXT: pushq %rbx -; X64-NEXT: subq $16, %rsp -; X64-NEXT: movl %edi, %ebx -; X64-NEXT: movzwl %si, %edi +; X64-NEXT: pushq %rax +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %bx, %edi +; X64-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; X64-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: addq $16, %rsp -; X64-NEXT: popq %rbx +; X64-NEXT: popq %rax ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v1f16: ; F16C: # %bb.0: -; F16C-NEXT: movzwl %si, %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: pextrw $0, %xmm0, %eax +; F16C-NEXT: pextrw $0, %xmm1, %ecx +; F16C-NEXT: movzwl %cx, %ecx +; F16C-NEXT: vmovd %ecx, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %di, %eax +; F16C-NEXT: movzwl %ax, %eax ; F16C-NEXT: vmovd %eax, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: pinsrw $0, %eax, %xmm0 ; F16C-NEXT: retq ; ; F16C-O0-LABEL: ir_fadd_v1f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: movw %si, %cx -; F16C-O0-NEXT: movw %di, %ax -; F16C-O0-NEXT: movzwl %cx, %ecx -; F16C-O0-NEXT: vmovd %ecx, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1 +; F16C-O0-NEXT: pextrw $0, %xmm1, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm1 +; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-O0-NEXT: pextrw $0, %xmm0, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax ; F16C-O0-NEXT: movzwl %ax, %eax ; F16C-O0-NEXT: vmovd %eax, %xmm0 ; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-O0-NEXT: vmovd %xmm0, %eax -; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movw %ax, %cx +; F16C-O0-NEXT: # implicit-def: $eax +; F16C-O0-NEXT: movw %cx, %ax +; F16C-O0-NEXT: # implicit-def: $xmm0 +; F16C-O0-NEXT: pinsrw $0, %eax, %xmm0 ; F16C-O0-NEXT: retq %retval = fadd <1 x half> %arg0, %arg1 ret <1 x half> %retval @@ -77,26 +86,30 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; X86-LABEL: ir_fadd_v2f16: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzwl 8(%ebp), %esi -; X86-NEXT: movzwl 12(%ebp), %edi -; X86-NEXT: movzwl 20(%ebp), %ebx -; X86-NEXT: movzwl 16(%ebp), %eax -; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: subl 
$80, %esp +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; X86-NEXT: movl %ebx, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee ; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; X86-NEXT: movl %edi, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee -; X86-NEXT: movl %esi, (%esp) +; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload +; X86-NEXT: pextrw $0, %xmm0, %eax +; X86-NEXT: movw %ax, (%esp) ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-NEXT: fstps {{[0-9]+}}(%esp) @@ -108,117 +121,113 @@ ; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_f2h_ieee +; X86-NEXT: movd %xmm0, %eax +; X86-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill ; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: addss {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movss %xmm0, (%esp) -; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) ; X86-NEXT: calll __gnu_f2h_ieee -; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) -; X86-NEXT: movdqa {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: movd %xmm0, %eax -; X86-NEXT: pextrw $1, %xmm0, %edx -; X86-NEXT: # kill: def $ax killed $ax killed $eax -; X86-NEXT: # kill: def $dx killed $dx killed $edx -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movaps %xmm0, %xmm1 +; X86-NEXT: pinsrw $0, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; X86-NEXT: addl $80, %esp ; X86-NEXT: retl ; ; X64-LABEL: ir_fadd_v2f16: ; X64: # %bb.0: -; X64-NEXT: pushq %rbp -; X64-NEXT: pushq %r14 -; X64-NEXT: pushq %rbx -; X64-NEXT: subq $32, %rsp -; X64-NEXT: movl %edx, %ebx -; X64-NEXT: movl %esi, %ebp -; X64-NEXT: movl %edi, %r14d -; X64-NEXT: movzwl %cx, %edi +; X64-NEXT: subq $24, %rsp +; X64-NEXT: movd %xmm3, %eax +; X64-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; X64-NEXT: movd %xmm1, %eax +; X64-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; X64-NEXT: movdqa %xmm2, %xmm0 ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %bp, %edi +; X64-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; X64-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; X64-NEXT: movzwl %bx, %edi +; X64-NEXT: movd %xmm0, %eax +; X64-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; X64-NEXT: pinsrw $0, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload ; X64-NEXT: callq __gnu_h2f_ieee@PLT -; X64-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; X64-NEXT: movzwl %r14w, %edi +; X64-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; X64-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload ; X64-NEXT: callq __gnu_h2f_ieee@PLT ; X64-NEXT: addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; X64-NEXT: callq __gnu_f2h_ieee@PLT -; X64-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; X64-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: pextrw $1, %xmm0, %edx -; X64-NEXT: # kill: def $ax killed $ax killed $eax -; X64-NEXT: # kill: def $dx killed $dx killed $edx -; X64-NEXT: addq $32, %rsp -; X64-NEXT: popq %rbx -; X64-NEXT: popq %r14 -; X64-NEXT: popq %rbp +; X64-NEXT: movaps %xmm0, %xmm1 +; X64-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; X64-NEXT: addq $24, %rsp ; X64-NEXT: retq ; ; F16C-LABEL: ir_fadd_v2f16: ; F16C: # %bb.0: -; F16C-NEXT: movzwl %cx, %eax -; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: pextrw $0, %xmm1, %eax +; F16C-NEXT: pextrw $0, %xmm3, %ecx +; F16C-NEXT: pextrw $0, %xmm0, %edx +; F16C-NEXT: pextrw $0, %xmm2, %esi +; F16C-NEXT: movzwl %si, %esi +; F16C-NEXT: vmovd %esi, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %si, %eax -; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: movzwl %dx, %edx +; F16C-NEXT: vmovd %edx, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-NEXT: movzwl %dx, %eax -; F16C-NEXT: vmovd %eax, %xmm0 -; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-NEXT: movzwl %di, %eax -; F16C-NEXT: vmovd %eax, %xmm1 +; F16C-NEXT: vmovd %xmm0, %edx +; F16C-NEXT: pinsrw $0, %edx, %xmm0 +; F16C-NEXT: movzwl %cx, %ecx +; F16C-NEXT: vmovd %ecx, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; F16C-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: vpextrw $1, %xmm0, %edx -; F16C-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-NEXT: # kill: def $dx killed $dx killed $edx +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm2 +; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-NEXT: vaddss %xmm1, %xmm2, %xmm1 +; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; F16C-NEXT: vmovd %xmm1, %eax +; F16C-NEXT: pinsrw $0, %eax, %xmm1 ; F16C-NEXT: retq ; ; F16C-O0-LABEL: ir_fadd_v2f16: ; F16C-O0: # %bb.0: -; F16C-O0-NEXT: movl %esi, %eax -; F16C-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; F16C-O0-NEXT: movw %dx, %si +; F16C-O0-NEXT: pextrw $0, %xmm2, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm2 +; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-O0-NEXT: pextrw $0, %xmm0, %eax ; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: movw %di, %dx -; F16C-O0-NEXT: movzwl %si, %esi -; F16C-O0-NEXT: vmovd %esi, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm1 -; F16C-O0-NEXT: movzwl %dx, %edx -; F16C-O0-NEXT: vmovd %edx, %xmm0 -; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-O0-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-O0-NEXT: movzwl %cx, %ecx -; F16C-O0-NEXT: vmovd %ecx, %xmm0 -; F16C-O0-NEXT: 
vcvtph2ps %xmm0, %xmm1 ; F16C-O0-NEXT: movzwl %ax, %eax ; F16C-O0-NEXT: vmovd %eax, %xmm0 ; F16C-O0-NEXT: vcvtph2ps %xmm0, %xmm0 -; F16C-O0-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; F16C-O0-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; F16C-O0-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-O0-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp) -; F16C-O0-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 ; F16C-O0-NEXT: vmovd %xmm0, %eax +; F16C-O0-NEXT: movw %ax, %cx +; F16C-O0-NEXT: # implicit-def: $eax +; F16C-O0-NEXT: movw %cx, %ax +; F16C-O0-NEXT: # implicit-def: $xmm0 +; F16C-O0-NEXT: pinsrw $0, %eax, %xmm0 +; F16C-O0-NEXT: pextrw $0, %xmm3, %eax ; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-O0-NEXT: vpextrw $1, %xmm0, %ecx -; F16C-O0-NEXT: movw %cx, %dx +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm2 +; F16C-O0-NEXT: vcvtph2ps %xmm2, %xmm2 +; F16C-O0-NEXT: pextrw $0, %xmm1, %eax +; F16C-O0-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-O0-NEXT: movzwl %ax, %eax +; F16C-O0-NEXT: vmovd %eax, %xmm1 +; F16C-O0-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-O0-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; F16C-O0-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; F16C-O0-NEXT: vmovd %xmm1, %eax +; F16C-O0-NEXT: movw %ax, %cx +; F16C-O0-NEXT: # implicit-def: $eax +; F16C-O0-NEXT: movw %cx, %ax +; F16C-O0-NEXT: # implicit-def: $xmm1 +; F16C-O0-NEXT: pinsrw $0, %eax, %xmm1 ; F16C-O0-NEXT: retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/llvm/test/CodeGen/X86/pr38533.ll b/llvm/test/CodeGen/X86/pr38533.ll --- a/llvm/test/CodeGen/X86/pr38533.ll +++ b/llvm/test/CodeGen/X86/pr38533.ll @@ -1,23 +1,53 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX ; This test makes sure that a vector that needs to be promoted that is bitcasted to fp16 is legalized correctly without causing a width mismatch. define void @constant_fold_vector_to_half() { -; CHECK-LABEL: constant_fold_vector_to_half: -; CHECK: # %bb.0: -; CHECK-NEXT: movw $16384, (%rax) # imm = 0x4000 -; CHECK-NEXT: retq +; SSE-LABEL: constant_fold_vector_to_half: +; SSE: # %bb.0: +; SSE-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000 +; SSE-NEXT: pinsrw $0, -{{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rax) +; SSE-NEXT: retq +; +; SSE41-LABEL: constant_fold_vector_to_half: +; SSE41: # %bb.0: +; SSE41-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000 +; SSE41-NEXT: pinsrw $0, -{{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: pextrw $0, %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX-LABEL: constant_fold_vector_to_half: +; AVX: # %bb.0: +; AVX-NEXT: movw $16384, -{{[0-9]+}}(%rsp) # imm = 0x4000 +; AVX-NEXT: pinsrw $0, -{{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX-NEXT: retq store volatile half bitcast (<4 x i4> to half), half* undef ret void } ; Similarly this makes sure that the opposite bitcast of the above is also legalized without crashing. 
define void @pr38533_2(half %x) { -; CHECK-LABEL: pr38533_2: -; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, (%rax) -; CHECK-NEXT: retq +; SSE-LABEL: pr38533_2: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rax) +; SSE-NEXT: retq +; +; SSE41-LABEL: pr38533_2: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrw $0, %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX-LABEL: pr38533_2: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX-NEXT: retq %a = bitcast half %x to <4 x i4> store volatile <4 x i4> %a, <4 x i4>* undef ret void @@ -25,10 +55,21 @@ ; This case is a bitcast from fp16 to a 16-bit wide legal vector type. In this case the result type is legal when the bitcast gets type legalized. define void @pr38533_3(half %x) { -; CHECK-LABEL: pr38533_3: -; CHECK: # %bb.0: -; CHECK-NEXT: movw %di, (%rax) -; CHECK-NEXT: retq +; SSE-LABEL: pr38533_3: +; SSE: # %bb.0: +; SSE-NEXT: pextrw $0, %xmm0, %eax +; SSE-NEXT: movw %ax, (%rax) +; SSE-NEXT: retq +; +; SSE41-LABEL: pr38533_3: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrw $0, %xmm0, (%rax) +; SSE41-NEXT: retq +; +; AVX-LABEL: pr38533_3: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX-NEXT: retq %a = bitcast half %x to <16 x i1> store volatile <16 x i1> %a, <16 x i1>* undef ret void diff --git a/llvm/test/CodeGen/X86/pr47000.ll b/llvm/test/CodeGen/X86/pr47000.ll --- a/llvm/test/CodeGen/X86/pr47000.ll +++ b/llvm/test/CodeGen/X86/pr47000.ll @@ -1,141 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mcpu=pentium4 -O0 | FileCheck %s +; Per the ABI, half type is not supported without SSE2. +; RUN: not --crash llc < %s -mcpu=pentium4 -O0 target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386-unknown-linux-unknown" define <4 x half> @doTheTestMod(<4 x half> %0, <4 x half> %1) nounwind { -; CHECK-LABEL: doTheTestMod: -; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: pushl %ebp -; CHECK-NEXT: pushl %ebx -; CHECK-NEXT: pushl %edi -; CHECK-NEXT: pushl %esi -; CHECK-NEXT: subl $124, %esp -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %si -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %dx -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %cx -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %ax -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %di -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bx -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %bp -; CHECK-NEXT: movw {{[0-9]+}}(%esp), %ax -; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %ax # 2-byte Reload -; CHECK-NEXT: movw %bp, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %di, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %si, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %dx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) -; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fxch %st(1) -; CHECK-NEXT: fstps 4(%eax) -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fxch %st(1) -; CHECK-NEXT: fstps 4(%eax) -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, %si -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fxch %st(1) -; CHECK-NEXT: fstps 4(%eax) -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, %di -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: calll __gnu_h2f_ieee -; CHECK-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fxch %st(1) -; CHECK-NEXT: fstps 4(%eax) -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll fmodf -; CHECK-NEXT: movl %esp, %eax -; CHECK-NEXT: fstps (%eax) -; CHECK-NEXT: calll __gnu_f2h_ieee -; CHECK-NEXT: movw {{[-0-9]+}}(%e{{[sb]}}p), %dx # 2-byte Reload -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; CHECK-NEXT: movw %ax, %bx -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; CHECK-NEXT: movw %bx, 6(%ecx) -; CHECK-NEXT: movw %di, 4(%ecx) -; CHECK-NEXT: movw 
%si, 2(%ecx) -; CHECK-NEXT: movw %dx, (%ecx) -; CHECK-NEXT: addl $124, %esp -; CHECK-NEXT: popl %esi -; CHECK-NEXT: popl %edi -; CHECK-NEXT: popl %ebx -; CHECK-NEXT: popl %ebp -; CHECK-NEXT: retl $4 Entry: %x = alloca <4 x half>, align 8 %y = alloca <4 x half>, align 8 diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll --- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll +++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll @@ -4,28 +4,30 @@ define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movzwl 2(%rdi), %ecx -; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movzwl 6(%rdi), %r8d -; CHECK-NEXT: movzwl 4(%rdi), %r11d -; CHECK-NEXT: movq (%rsi), %rsi -; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: pextrw $1, %xmm0, %r9d -; CHECK-NEXT: movd %xmm0, %r10d -; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi -; CHECK-NEXT: pextrw $3, %xmm0, %eax -; CHECK-NEXT: pextrw $2, %xmm0, %edi -; CHECK-NEXT: movw %r11w, 8(%rdx) -; CHECK-NEXT: movw %cx, 4(%rdx) -; CHECK-NEXT: movw %r8w, 12(%rdx) -; CHECK-NEXT: movw %si, (%rdx) -; CHECK-NEXT: movw %di, 10(%rdx) +; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-NEXT: pinsrw $0, 2(%rdi), %xmm1 +; CHECK-NEXT: pinsrw $0, 4(%rdi), %xmm2 +; CHECK-NEXT: pinsrw $0, 6(%rdi), %xmm3 +; CHECK-NEXT: pinsrw $0, (%rsi), %xmm4 +; CHECK-NEXT: pinsrw $0, 2(%rsi), %xmm5 +; CHECK-NEXT: pinsrw $0, 4(%rsi), %xmm6 +; CHECK-NEXT: pinsrw $0, 6(%rsi), %xmm7 +; CHECK-NEXT: pextrw $0, %xmm7, %eax ; CHECK-NEXT: movw %ax, 14(%rdx) -; CHECK-NEXT: movw %r10w, 2(%rdx) -; CHECK-NEXT: movw %r9w, 6(%rdx) +; CHECK-NEXT: pextrw $0, %xmm3, %eax +; CHECK-NEXT: movw %ax, 12(%rdx) +; CHECK-NEXT: pextrw $0, %xmm6, %eax +; CHECK-NEXT: movw %ax, 10(%rdx) +; CHECK-NEXT: pextrw $0, %xmm2, %eax +; CHECK-NEXT: movw %ax, 8(%rdx) +; CHECK-NEXT: pextrw $0, %xmm5, %eax +; CHECK-NEXT: movw %ax, 6(%rdx) +; CHECK-NEXT: pextrw $0, %xmm1, %eax +; CHECK-NEXT: movw %ax, 4(%rdx) +; CHECK-NEXT: pextrw $0, %xmm4, %eax +; CHECK-NEXT: movw %ax, 2(%rdx) +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, (%rdx) ; CHECK-NEXT: retq %tmp4 = load <4 x half>, <4 x half>* %a %tmp5 = load <4 x half>, <4 x half>* %b diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir @@ -340,7 +340,7 @@ ; CHECK: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags ; CHECK: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0 ; CHECK: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags - ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4325385 /* reguse:GR64 */, %102, 4325385 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %102, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags ; CHECK: LCMPXCHG32 
undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic 4 on `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK: $rdi = COPY [[COPY4]] @@ -456,7 +456,7 @@ %63:gr64 = NOT64r %63 CMP64rr %63, %31, implicit-def $eflags %63:gr64 = CMOV64rr %63, %53, 4, implicit killed $eflags - INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4325385 /* reguse:GR64 */, %53, 4325385 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4456457 /* reguse:GR64 */, %53, 4456457 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, %65, implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic 4 on `i32 addrspace(1)* undef`, addrspace 1) ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $rdi = COPY %64 diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2153,61 +2153,38 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { ; SSE-LABEL: fptosi_2f16_to_4i32: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp ; SSE-NEXT: pushq %rbx -; SSE-NEXT: pushq %rax -; SSE-NEXT: movl %esi, %ebx -; SSE-NEXT: movzwl %di, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: cvttss2si %xmm0, %ebp -; SSE-NEXT: movzwl %bx, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: cvttss2si %xmm0, %eax +; SSE-NEXT: subq $16, %rsp +; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; SSE-NEXT: callq __fixhfsi@PLT +; SSE-NEXT: movl %eax, %ebx +; SSE-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; SSE-NEXT: callq __fixhfsi@PLT ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movd %ebp, %xmm1 +; SSE-NEXT: movd %ebx, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: addq $16, %rsp ; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; -; VEX-LABEL: fptosi_2f16_to_4i32: -; VEX: # %bb.0: -; VEX-NEXT: pushq %rbp -; VEX-NEXT: pushq %rbx -; VEX-NEXT: pushq %rax -; VEX-NEXT: movl %esi, %ebx -; VEX-NEXT: movzwl %di, %edi -; VEX-NEXT: callq __gnu_h2f_ieee@PLT -; VEX-NEXT: vcvttss2si %xmm0, %ebp -; VEX-NEXT: movzwl %bx, %edi -; VEX-NEXT: callq __gnu_h2f_ieee@PLT -; VEX-NEXT: vcvttss2si %xmm0, %eax -; VEX-NEXT: vmovd %eax, %xmm0 -; VEX-NEXT: vmovd %ebp, %xmm1 -; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VEX-NEXT: addq $8, %rsp -; VEX-NEXT: popq %rbx -; VEX-NEXT: popq %rbp -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f16_to_4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %di, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %eax 
-; AVX512-NEXT: movzwl %si, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: retq +; AVX-LABEL: fptosi_2f16_to_4i32: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $16, %rsp +; AVX-NEXT: vpextrw $0, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Folded Spill +; AVX-NEXT: callq __fixhfsi@PLT +; AVX-NEXT: movl %eax, %ebx +; AVX-NEXT: vpinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 2-byte Folded Reload +; AVX-NEXT: callq __fixhfsi@PLT +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vmovd %ebx, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: addq $16, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq %cvt = fptosi <2 x half> %a to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> ret <4 x i32> %ext diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -243,11 +243,8 @@ define double @cvt_i16_to_f64(i16 %a0) nounwind { ; ALL-LABEL: cvt_i16_to_f64: ; ALL: # %bb.0: -; ALL-NEXT: movzwl %di, %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: retq +; ALL-NEXT: pinsrw $0, %edi, %xmm0 +; ALL-NEXT: jmp __extendhfdf2@PLT # TAILCALL %1 = bitcast i16 %a0 to half %2 = fpext half %1 to double ret double %2 @@ -388,11 +385,8 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind { ; ALL-LABEL: load_cvt_i16_to_f64: ; ALL: # %bb.0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: retq +; ALL-NEXT: pinsrw $0, (%rdi), %xmm0 +; ALL-NEXT: jmp __extendhfdf2@PLT # TAILCALL %1 = load i16, i16* %a0 %2 = bitcast i16 %1 to half %3 = fpext half %2 to double @@ -557,7 +551,8 @@ ; ALL-LABEL: store_cvt_f32_to_i16: ; ALL: # %bb.0: ; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vpextrw $0, %xmm0, (%rdi) +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: movw %ax, (%rdi) ; ALL-NEXT: retq %1 = fptrunc float %a0 to half %2 = bitcast half %1 to i16 @@ -647,7 +642,12 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind { ; ALL-LABEL: cvt_f64_to_i16: ; ALL: # %bb.0: -; ALL-NEXT: jmp __truncdfhf2@PLT # TAILCALL +; ALL-NEXT: pushq %rax +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: pextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: popq %rcx +; ALL-NEXT: retq %1 = fptrunc double %a0 to half %2 = bitcast half %1 to i16 ret i16 %2 @@ -657,13 +657,13 @@ ; ALL-LABEL: cvt_2f64_to_2i16: ; ALL: # %bb.0: ; ALL-NEXT: subq $40, %rsp -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) +; ALL-NEXT: vpextrw $0, %xmm0, (%rsp) ; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; ALL-NEXT: # xmm0 = mem[1,0] ; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) +; ALL-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) ; ALL-NEXT: vmovaps (%rsp), %xmm0 ; ALL-NEXT: addq $40, 
%rsp ; ALL-NEXT: retq @@ -673,62 +673,166 @@ } define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_4i16: -; ALL: # %bb.0: -; ALL-NEXT: subq $72, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $72, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_4i16: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $72, %rsp +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: addq $72, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_4i16: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps (%rsp), %xmm0 +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; 
AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> ret <4 x i16> %2 } define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: subq $72, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: addq $72, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $72, %rsp +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: addq $72, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: 
vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps (%rsp), %xmm0 +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -736,31 +840,83 @@ } define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: subq $72, %rsp -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: addq $72, %rsp -; ALL-NEXT: retq +; AVX1-LABEL: cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: subq $72, %rsp +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: 
vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: addq $72, %rsp +; AVX1-NEXT: retq +; +; AVX2-LABEL: cvt_4f64_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: subq $72, %rsp +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: addq $72, %rsp +; AVX2-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -770,7 +926,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { ; AVX1-LABEL: cvt_8f64_to_8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: pushq %rbp ; AVX1-NEXT: pushq %r14 ; AVX1-NEXT: pushq %rbx ; AVX1-NEXT: subq $64, %rsp @@ -779,65 +935,69 @@ ; 
AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx +; AVX1-NEXT: pextrw $0, %xmm0, %ebx ; AVX1-NEXT: shll $16, %ebx ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r15d -; AVX1-NEXT: orl %ebx, %r15d +; AVX1-NEXT: pextrw $0, %xmm0, %eax +; AVX1-NEXT: movzwl %ax, %r14d +; AVX1-NEXT: orl %ebx, %r14d ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: pextrw $0, %xmm0, %ebp +; AVX1-NEXT: shll $16, %ebp +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r14d -; AVX1-NEXT: orl %ebx, %r14d -; AVX1-NEXT: shlq $32, %r14 -; AVX1-NEXT: orq %r15, %r14 +; AVX1-NEXT: pextrw $0, %xmm0, %eax +; AVX1-NEXT: movzwl %ax, %ebx +; AVX1-NEXT: orl %ebp, %ebx +; AVX1-NEXT: shlq $32, %rbx +; AVX1-NEXT: orq %r14, %rbx ; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,0] ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx +; AVX1-NEXT: pextrw $0, %xmm0, %ebp +; AVX1-NEXT: shll $16, %ebp ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movzwl %ax, %r15d -; AVX1-NEXT: orl %ebx, %r15d +; AVX1-NEXT: pextrw $0, %xmm0, %eax +; AVX1-NEXT: movzwl %ax, %r14d +; AVX1-NEXT: orl %ebp, %r14d ; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebx -; AVX1-NEXT: shll $16, %ebx -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: pextrw $0, %xmm0, %ebp +; AVX1-NEXT: shll $16, %ebp +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: pextrw $0, %xmm0, %eax ; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: orl %ebx, %eax +; AVX1-NEXT: orl %ebp, %eax ; AVX1-NEXT: shlq $32, %rax -; AVX1-NEXT: orq %r15, %rax +; AVX1-NEXT: orq %r14, %rax ; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vmovq %r14, %xmm1 +; AVX1-NEXT: vmovq %rbx, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: addq $64, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_8f64_to_8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 +; AVX2-NEXT: pushq %rbp ; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx ; AVX2-NEXT: subq $64, %rsp @@ -846,65 +1006,69 @@ ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx +; AVX2-NEXT: pextrw $0, %xmm0, %ebx ; AVX2-NEXT: shll $16, %ebx ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 
killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r15d -; AVX2-NEXT: orl %ebx, %r15d +; AVX2-NEXT: pextrw $0, %xmm0, %eax +; AVX2-NEXT: movzwl %ax, %r14d +; AVX2-NEXT: orl %ebx, %r14d ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: pextrw $0, %xmm0, %ebp +; AVX2-NEXT: shll $16, %ebp +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r14d -; AVX2-NEXT: orl %ebx, %r14d -; AVX2-NEXT: shlq $32, %r14 -; AVX2-NEXT: orq %r15, %r14 +; AVX2-NEXT: pextrw $0, %xmm0, %eax +; AVX2-NEXT: movzwl %ax, %ebx +; AVX2-NEXT: orl %ebp, %ebx +; AVX2-NEXT: shlq $32, %rbx +; AVX2-NEXT: orq %r14, %rbx ; AVX2-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[1,0] ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx +; AVX2-NEXT: pextrw $0, %xmm0, %ebp +; AVX2-NEXT: shll $16, %ebp ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movzwl %ax, %r15d -; AVX2-NEXT: orl %ebx, %r15d +; AVX2-NEXT: pextrw $0, %xmm0, %eax +; AVX2-NEXT: movzwl %ax, %r14d +; AVX2-NEXT: orl %ebp, %r14d ; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebx -; AVX2-NEXT: shll $16, %ebx -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: pextrw $0, %xmm0, %ebp +; AVX2-NEXT: shll $16, %ebp +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: pextrw $0, %xmm0, %eax ; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: orl %ebx, %eax +; AVX2-NEXT: orl %ebp, %eax ; AVX2-NEXT: shlq $32, %rax -; AVX2-NEXT: orq %r15, %rax +; AVX2-NEXT: orq %r14, %rax ; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vmovq %r14, %xmm1 +; AVX2-NEXT: vmovq %rbx, %xmm1 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX2-NEXT: addq $64, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 +; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %rbp ; AVX512-NEXT: pushq %r14 ; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: subq $80, %rsp @@ -912,63 +1076,67 @@ ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: pextrw $0, %xmm0, %ebx ; AVX512-NEXT: shll $16, %ebx ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r15d -; AVX512-NEXT: orl %ebx, %r15d +; AVX512-NEXT: pextrw $0, %xmm0, %eax +; AVX512-NEXT: movzwl %ax, %r14d +; AVX512-NEXT: orl %ebx, %r14d ; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload ; 
AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: pextrw $0, %xmm0, %ebp +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r14d -; AVX512-NEXT: orl %ebx, %r14d -; AVX512-NEXT: shlq $32, %r14 -; AVX512-NEXT: orq %r15, %r14 +; AVX512-NEXT: pextrw $0, %xmm0, %eax +; AVX512-NEXT: movzwl %ax, %ebx +; AVX512-NEXT: orl %ebp, %ebx +; AVX512-NEXT: shlq $32, %rbx +; AVX512-NEXT: orq %r14, %rbx ; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx +; AVX512-NEXT: pextrw $0, %xmm0, %ebp +; AVX512-NEXT: shll $16, %ebp ; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movzwl %ax, %r15d -; AVX512-NEXT: orl %ebx, %r15d +; AVX512-NEXT: pextrw $0, %xmm0, %eax +; AVX512-NEXT: movzwl %ax, %r14d +; AVX512-NEXT: orl %ebp, %r14d ; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebx -; AVX512-NEXT: shll $16, %ebx -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: pextrw $0, %xmm0, %ebp +; AVX512-NEXT: shll $16, %ebp +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: pextrw $0, %xmm0, %eax ; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: orl %ebx, %eax +; AVX512-NEXT: orl %ebp, %eax ; AVX512-NEXT: shlq $32, %rax -; AVX512-NEXT: orq %r15, %rax +; AVX512-NEXT: orq %r14, %rax ; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vmovq %r14, %xmm1 +; AVX512-NEXT: vmovq %rbx, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512-NEXT: addq $80, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> @@ -985,7 +1153,7 @@ ; ALL-NEXT: pushq %rbx ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rbx) +; ALL-NEXT: vpextrw $0, %xmm0, (%rbx) ; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc double %a0 to half @@ -997,21 +1165,20 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, <2 x i16>* %a1) nounwind { ; ALL-LABEL: store_cvt_2f64_to_2i16: ; ALL: # %bb.0: -; ALL-NEXT: pushq %rbp ; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $24, %rsp +; ALL-NEXT: subq $32, %rsp ; ALL-NEXT: movq %rdi, %rbx ; ALL-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; ALL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movl %eax, %ebp -; ALL-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; ALL-NEXT: vmovapd %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rbx) -; ALL-NEXT: movw %bp, 2(%rbx) -; ALL-NEXT: addq $24, %rsp +; ALL-NEXT: vpextrw $0, %xmm0, (%rbx) +; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; ALL-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; ALL-NEXT: addq $32, %rsp ; ALL-NEXT: popq %rbx -; ALL-NEXT: popq %rbp ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> @@ -1022,116 +1189,107 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind { ; AVX1-LABEL: store_cvt_4f64_to_4i16: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $56, %rsp +; AVX1-NEXT: subq $80, %rsp ; AVX1-NEXT: movq %rdi, %rbx -; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r14d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, 4(%rbx) -; AVX1-NEXT: movw %bp, (%rbx) -; AVX1-NEXT: movw %r15w, 6(%rbx) -; AVX1-NEXT: movw %r14w, 2(%rbx) -; AVX1-NEXT: addq $56, %rsp +; AVX1-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX1-NEXT: addq $80, %rsp ; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_cvt_4f64_to_4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $56, %rsp +; AVX2-NEXT: subq $80, %rsp ; AVX2-NEXT: movq %rdi, %rbx -; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r14d -; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte 
Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, 4(%rbx) -; AVX2-NEXT: movw %bp, (%rbx) -; AVX2-NEXT: movw %r15w, 6(%rbx) -; AVX2-NEXT: movw %r14w, 2(%rbx) -; AVX2-NEXT: addq $56, %rsp +; AVX2-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX2-NEXT: addq $80, %rsp ; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_cvt_4f64_to_4i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: subq $80, %rsp ; AVX512-NEXT: movq %rdi, %rbx -; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r14d -; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, 4(%rbx) -; AVX512-NEXT: movw %bp, (%rbx) -; AVX512-NEXT: movw %r15w, 6(%rbx) -; AVX512-NEXT: movw %r14w, 2(%rbx) -; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: 
vpextrw $0, %xmm0, 6(%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX512-NEXT: addq $80, %rsp ; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -1140,35 +1298,95 @@ } define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovaps (%rsp), %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $64, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovaps (%rsp), %xmm0 +; AVX1-NEXT: vmovaps %xmm0, (%rbx) +; AVX1-NEXT: addq $64, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) 
+; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps (%rsp), %xmm0 +; AVX2-NEXT: vmovaps %xmm0, (%rbx) +; AVX2-NEXT: addq $64, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovaps (%rsp), %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rbx) +; AVX512-NEXT: addq $64, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -1177,35 +1395,95 @@ } define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_zero: -; ALL: # %bb.0: -; ALL-NEXT: pushq %rbx -; ALL-NEXT: subq $64, %rsp -; ALL-NEXT: movq %rdi, %rbx -; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, (%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; ALL-NEXT: # xmm0 = mem[1,0] -; ALL-NEXT: callq __truncdfhf2@PLT -; ALL-NEXT: movw %ax, {{[0-9]+}}(%rsp) -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vmovaps %xmm0, (%rbx) -; ALL-NEXT: addq $64, %rsp -; ALL-NEXT: popq %rbx -; ALL-NEXT: retq +; AVX1-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX1: # %bb.0: +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: subq $64, %rsp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX1-NEXT: # xmm0 = mem[1,0] +; AVX1-NEXT: callq __truncdfhf2@PLT +; AVX1-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovaps %xmm0, (%rbx) +; AVX1-NEXT: addq $64, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: subq $64, %rsp +; AVX2-NEXT: movq %rdi, %rbx +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX2-NEXT: # xmm0 = mem[1,0] +; AVX2-NEXT: callq __truncdfhf2@PLT +; AVX2-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovaps %xmm0, (%rbx) +; AVX2-NEXT: addq $64, %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovaps %xmm0, (%rbx) +; AVX512-NEXT: addq $64, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 
to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> @@ -1216,208 +1494,193 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind { ; AVX1-LABEL: store_cvt_8f64_to_8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rbp -; AVX1-NEXT: pushq %r15 -; AVX1-NEXT: pushq %r14 -; AVX1-NEXT: pushq %r13 -; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $120, %rsp +; AVX1-NEXT: subq $160, %rsp ; AVX1-NEXT: movq %rdi, %rbx ; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = mem[1,0] ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %ebp -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movl %eax, %r15d -; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-NEXT: callq __truncdfhf2@PLT -; AVX1-NEXT: movw %ax, 12(%rbx) -; AVX1-NEXT: movw %r15w, 8(%rbx) -; AVX1-NEXT: movw %r14w, 4(%rbx) -; AVX1-NEXT: movw %bp, (%rbx) -; AVX1-NEXT: movw %r13w, 14(%rbx) -; AVX1-NEXT: movw %r12w, 10(%rbx) -; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX1-NEXT: movw %ax, 6(%rbx) -; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX1-NEXT: movw %ax, 2(%rbx) -; AVX1-NEXT: addq $120, %rsp +; AVX1-NEXT: vpextrw $0, %xmm0, 12(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: 
vpextrw $0, %xmm0, 8(%rbx) +; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 14(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 10(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX1-NEXT: addq $160, %rsp ; AVX1-NEXT: popq %rbx -; AVX1-NEXT: popq %r12 -; AVX1-NEXT: popq %r13 -; AVX1-NEXT: popq %r14 -; AVX1-NEXT: popq %r15 -; AVX1-NEXT: popq %rbp ; AVX1-NEXT: retq ; ; AVX2-LABEL: store_cvt_8f64_to_8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 ; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $120, %rsp +; AVX2-NEXT: subq $160, %rsp ; AVX2-NEXT: movq %rdi, %rbx ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = mem[1,0] ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r12d +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r13d +; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %ebp -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r14d +; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movl %eax, %r15d -; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-NEXT: callq __truncdfhf2@PLT -; AVX2-NEXT: movw %ax, 12(%rbx) -; AVX2-NEXT: movw %r15w, 8(%rbx) -; AVX2-NEXT: movw %r14w, 4(%rbx) -; AVX2-NEXT: movw %bp, (%rbx) -; AVX2-NEXT: movw %r13w, 14(%rbx) -; AVX2-NEXT: movw %r12w, 10(%rbx) -; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX2-NEXT: movw %ax, 6(%rbx) -; AVX2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX2-NEXT: movw %ax, 2(%rbx) -; AVX2-NEXT: addq $120, %rsp +; AVX2-NEXT: vpextrw $0, %xmm0, 12(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 8(%rbx) +; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 14(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 10(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX2-NEXT: addq $160, %rsp ; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp ; AVX2-NEXT: retq ; ; AVX512-LABEL: store_cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $152, %rsp +; AVX512-NEXT: subq $192, %rsp ; AVX512-NEXT: movq %rdi, %rbx ; AVX512-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0 ; AVX512-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r12d +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r13d +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %ebp -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r14d +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movl %eax, %r15d -; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-NEXT: callq __truncdfhf2@PLT -; AVX512-NEXT: movw %ax, 12(%rbx) -; AVX512-NEXT: movw %r15w, 8(%rbx) -; AVX512-NEXT: movw %r14w, 4(%rbx) -; AVX512-NEXT: movw %bp, (%rbx) -; AVX512-NEXT: movw %r13w, 14(%rbx) -; AVX512-NEXT: movw %r12w, 10(%rbx) -; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX512-NEXT: movw %ax, 6(%rbx) -; AVX512-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload -; AVX512-NEXT: movw %ax, 2(%rbx) -; AVX512-NEXT: addq $152, %rsp +; AVX512-NEXT: vpextrw $0, %xmm0, 12(%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 8(%rbx) +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 4(%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 14(%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 10(%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 6(%rbx) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpextrw $0, %xmm0, 2(%rbx) +; AVX512-NEXT: addq $192, %rsp ; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax-nnan.ll @@ -364,72 +364,84 @@ } define half @test_v2f16(<2 x half> %a0) nounwind { -; SSE-LABEL: test_v2f16: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $16, %rsp -; SSE-NEXT: movl %esi, %ebx -; SSE-NEXT: movl %edi, %r14d -; SSE-NEXT: movzwl %bx, %ebp -; SSE-NEXT: movl %ebp, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: movzwl %r14w, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; SSE-NEXT: cmoval %r14d, %ebx -; SSE-NEXT: movw %bx, (%rsp) -; SSE-NEXT: movl (%rsp), %eax 
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: addq $16, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq +; SSE2-LABEL: test_v2f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: callq __gnu_h2f_ieee@PLT +; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; SSE2-NEXT: callq __gnu_h2f_ieee@PLT +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: ucomiss %xmm1, %xmm0 +; SSE2-NEXT: ja .LBB10_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: .LBB10_2: +; SSE2-NEXT: callq __gnu_f2h_ieee@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rax +; SSE41-NEXT: pextrw $0, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Folded Spill +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: callq __gnu_h2f_ieee@PLT +; SSE41-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE41-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; SSE41-NEXT: callq __gnu_h2f_ieee@PLT +; SSE41-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE41-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: ucomiss %xmm1, %xmm0 +; SSE41-NEXT: ja .LBB10_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: .LBB10_2: +; SSE41-NEXT: callq __gnu_f2h_ieee@PLT +; SSE41-NEXT: popq %rax +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f16: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: movl %esi, %ebx -; AVX-NEXT: movl %edi, %r14d -; AVX-NEXT: movzwl %bx, %ebp -; AVX-NEXT: movl %ebp, %edi +; AVX-NEXT: pushq %rax +; AVX-NEXT: vpextrw $0, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Folded Spill +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: callq __gnu_h2f_ieee@PLT -; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: movzwl %r14w, %edi +; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX-NEXT: vpinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 2-byte Folded Reload ; AVX-NEXT: callq __gnu_h2f_ieee@PLT -; AVX-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVX-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; AVX-NEXT: cmoval %r14d, %ebx -; AVX-NEXT: movw %bx, (%rsp) -; AVX-NEXT: movl (%rsp), %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: ja .LBB10_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: .LBB10_2: +; AVX-NEXT: callq __gnu_f2h_ieee@PLT +; AVX-NEXT: popq %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f16: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: pextrw $0, %xmm0, %eax +; AVX512-NEXT: pextrw $0, %xmm1, %ecx +; AVX512-NEXT: movzwl %cx, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, 
%xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: cmoval %edi, %esi -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: seta %al +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: pinsrw $0, %eax, %xmm0 ; AVX512-NEXT: retq %1 = call nnan half @llvm.vector.reduce.fmax.v2f16(<2 x half> %a0) ret half %1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin-nnan.ll @@ -363,72 +363,84 @@ } define half @test_v2f16(<2 x half> %a0) nounwind { -; SSE-LABEL: test_v2f16: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $16, %rsp -; SSE-NEXT: movl %esi, %ebx -; SSE-NEXT: movl %edi, %r14d -; SSE-NEXT: movzwl %bx, %ebp -; SSE-NEXT: movl %ebp, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: movzwl %r14w, %edi -; SSE-NEXT: callq __gnu_h2f_ieee@PLT -; SSE-NEXT: ucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; SSE-NEXT: cmovbl %r14d, %ebx -; SSE-NEXT: movw %bx, (%rsp) -; SSE-NEXT: movl (%rsp), %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: addq $16, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq +; SSE2-LABEL: test_v2f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: callq __gnu_h2f_ieee@PLT +; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; SSE2-NEXT: callq __gnu_h2f_ieee@PLT +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: ucomiss %xmm1, %xmm0 +; SSE2-NEXT: jb .LBB10_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: .LBB10_2: +; SSE2-NEXT: callq __gnu_f2h_ieee@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v2f16: +; SSE41: # %bb.0: +; SSE41-NEXT: pushq %rax +; SSE41-NEXT: pextrw $0, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Folded Spill +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: callq __gnu_h2f_ieee@PLT +; SSE41-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE41-NEXT: pinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 2-byte Folded Reload +; SSE41-NEXT: callq __gnu_h2f_ieee@PLT +; SSE41-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE41-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: ucomiss %xmm1, %xmm0 +; SSE41-NEXT: jb .LBB10_2 +; SSE41-NEXT: # %bb.1: +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: .LBB10_2: +; SSE41-NEXT: callq __gnu_f2h_ieee@PLT +; SSE41-NEXT: popq %rax +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v2f16: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: movl %esi, %ebx -; AVX-NEXT: movl %edi, %r14d -; AVX-NEXT: movzwl %bx, %ebp -; AVX-NEXT: movl %ebp, %edi +; AVX-NEXT: pushq 
%rax +; AVX-NEXT: vpextrw $0, %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Folded Spill +; AVX-NEXT: vmovdqa %xmm1, %xmm0 ; AVX-NEXT: callq __gnu_h2f_ieee@PLT -; AVX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX-NEXT: movzwl %r14w, %edi +; AVX-NEXT: vmovd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; AVX-NEXT: vpinsrw $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 2-byte Folded Reload ; AVX-NEXT: callq __gnu_h2f_ieee@PLT -; AVX-NEXT: vucomiss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVX-NEXT: movw %bp, {{[0-9]+}}(%rsp) -; AVX-NEXT: cmovbl %r14d, %ebx -; AVX-NEXT: movw %bx, (%rsp) -; AVX-NEXT: movl (%rsp), %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %rbp +; AVX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; AVX-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vucomiss %xmm1, %xmm0 +; AVX-NEXT: jb .LBB10_2 +; AVX-NEXT: # %bb.1: +; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: .LBB10_2: +; AVX-NEXT: callq __gnu_f2h_ieee@PLT +; AVX-NEXT: popq %rax ; AVX-NEXT: retq ; ; AVX512-LABEL: test_v2f16: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: pextrw $0, %xmm0, %eax +; AVX512-NEXT: pextrw $0, %xmm1, %ecx +; AVX512-NEXT: movzwl %cx, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm0 ; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, %xmm1 ; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: cmovbl %edi, %esi -; AVX512-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl -{{[0-9]+}}(%rsp), %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax +; AVX512-NEXT: setb %al +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: pinsrw $0, %eax, %xmm0 ; AVX512-NEXT: retq %1 = call nnan half @llvm.vector.reduce.fmin.v2f16(<2 x half> %a0) ret half %1 diff --git a/llvm/test/MC/X86/x86_64-asm-match.s b/llvm/test/MC/X86/x86_64-asm-match.s --- a/llvm/test/MC/X86/x86_64-asm-match.s +++ b/llvm/test/MC/X86/x86_64-asm-match.s @@ -5,16 +5,16 @@ // CHECK: Trying to match opcode MMX_PSHUFBrr // CHECK: Matching formal operand class MCK_VR64 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode // CHECK: Trying to match opcode PSHUFBrr -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): Opcode result: multiple operand mismatches, ignoring this opcode // CHECK: Trying to match opcode PSHUFBrm // CHECK: Matching formal operand class MCK_Mem128 against actual operand at index 1 (Memory: ModeSize=64,BaseReg=rip,Scale=1,Disp=CPI1_0): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic 
matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 3: actual operand index out of range Opcode result: complete match, selecting this opcode // CHECK: AsmMatcher: found 2 encodings with mnemonic 'sha1rnds4' // CHECK: Trying to match opcode SHA1RNDS4rri // CHECK: Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:1): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 2 (Reg:xmm1): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm2): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 2 (Reg:xmm1): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm2): match success using generic matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode // CHECK: AsmMatcher: found 4 encodings with mnemonic 'pinsrw' // CHECK: Trying to match opcode MMX_PINSRWrr @@ -24,7 +24,7 @@ // CHECK: Trying to match opcode PINSRWrr // CHECK: Matching formal operand class MCK_ImmUnsignedi8 against actual operand at index 1 (Imm:3): match success using generic matcher // CHECK: Matching formal operand class MCK_GR32orGR64 against actual operand at index 2 (Reg:ecx): match success using generic matcher -// CHECK: Matching formal operand class MCK_FR32 against actual operand at index 3 (Reg:xmm5): match success using generic matcher +// CHECK: Matching formal operand class MCK_FR16 against actual operand at index 3 (Reg:xmm5): match success using generic matcher // CHECK: Matching formal operand class InvalidMatchClass against actual operand at index 4: actual operand index out of range Opcode result: complete match, selecting this opcode // CHECK: AsmMatcher: found 2 encodings with mnemonic 'crc32l' // CHECK: Trying to match opcode CRC32r32r32 diff --git a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir --- a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir +++ b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir @@ -28,7 +28,7 @@ bb.0: liveins: $xmm0, $xmm1, $xmm2, $xmm3 - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; FP16 reg is sub_reg of xmm %0:_(s16) = COPY $xmm0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** @@ -40,7 +40,7 @@ ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** %3:_(<8 x s32>) = COPY $xmm3 - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; FP16 reg is sub_reg of xmm $xmm0 = COPY %0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes ***