diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1755,8 +1755,6 @@
 
     bool needsCmpXchgNb(Type *MemType) const;
 
-    template <typename T> bool isSoftFP16(T VT) const;
-
     void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                 MachineBasicBlock *DispatchBB, int FI) const;
 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1613,7 +1613,7 @@
       setOperationAction(ISD::FP_ROUND, VT, Custom);
       setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
     }
-    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
+    for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
       setOperationAction(ISD::FP_EXTEND, VT, Custom);
       setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
     }
@@ -1621,9 +1621,6 @@
       setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
     }
-
-    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
   }
 
   // This block controls legalization of the mask vector sizes that are
@@ -1940,8 +1937,8 @@
     setF16Action(MVT::v32f16, Expand);
     setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
-    setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
-    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
+    setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Custom);
     for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
       setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
       setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
@@ -2162,9 +2159,9 @@
     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
     setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
-    setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
-    setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
 
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
@@ -2214,9 +2211,9 @@
     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
     setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
-    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
-    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
 
     // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
@@ -11914,13 +11911,9 @@
 }
 
 template <typename T>
-static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
-  return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
-}
-
-template <typename T>
-bool X86TargetLowering::isSoftFP16(T VT) const {
-  return ::isSoftFP16(VT, Subtarget);
+static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
+  T EltVT = VT.getScalarType();
+  return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
 }
 
 /// Try to lower insertion of a single element into a zero vector.
@@ -11936,7 +11929,7 @@
   unsigned NumElts = VT.getVectorNumElements();
   unsigned EltBits = VT.getScalarSizeInBits();
 
-  if (isSoftFP16(EltVT, Subtarget))
+  if (isSoftF16(EltVT, Subtarget))
     return SDValue();
 
   int V2Index =
@@ -17491,7 +17484,7 @@
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
 
-  if (isSoftFP16(VT)) {
+  if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeVectorElementTypeToInteger();
     return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
                                           DAG.getBitcast(NVT, LHS),
@@ -19019,7 +19012,7 @@
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
 
-  if (isSoftFP16(VT))
+  if (isSoftF16(VT, Subtarget))
     return promoteXINT_TO_FP(Op, DAG);
   else if (isLegalConversion(SrcVT, true, Subtarget))
     return Op;
@@ -19524,7 +19517,7 @@
   if (DstVT == MVT::f128)
     return SDValue();
 
-  if (isSoftFP16(DstVT))
+  if (isSoftF16(DstVT, Subtarget))
     return promoteXINT_TO_FP(Op, DAG);
   else if (isLegalConversion(SrcVT, false, Subtarget))
     return Op;
@@ -20543,7 +20536,7 @@
   SDLoc dl(Op);
   SDValue Res;
 
-  if (isSoftFP16(SrcVT)) {
+  if (isSoftF16(SrcVT, Subtarget)) {
     MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
     if (IsStrict)
       return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
@@ -20972,7 +20965,7 @@
 
   // This code is only for floats and doubles. Fall back to generic code for
   // anything else.
-  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
+  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
     return SDValue();
 
   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -21117,6 +21110,10 @@
                         !Subtarget.getTargetTriple().isOSDarwin()))
     return SDValue();
 
+  if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
+      (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
+    return Op;
+
   if (SVT == MVT::f16) {
     if (Subtarget.hasFP16())
       return Op;
@@ -21189,7 +21186,32 @@
   if (!SVT.isVector())
     return Op;
 
+  if (SVT.getVectorElementType() == MVT::bf16) {
+    // FIXME: Do we need to support strict FP?
+    assert(!IsStrict && "Strict FP doesn't support BF16");
+    if (VT.getVectorElementType() == MVT::f64) {
+      MVT TmpVT = VT.changeVectorElementType(MVT::f32);
+      return DAG.getNode(ISD::FP_EXTEND, DL, VT,
+                         DAG.getNode(ISD::FP_EXTEND, DL, TmpVT, In));
+    }
+    MVT IntVT = SVT.changeTypeToInteger();
+    In = DAG.getBitcast(IntVT, In);
+    int NumElts = VT.getVectorNumElements();
+    SmallVector<int, 32> Mask;
+    for (int I = 0; I != NumElts; ++I) {
+      Mask.push_back(NumElts);
+      Mask.push_back(I);
+    }
+    MVT NVT = MVT::getVectorVT(MVT::i16, NumElts * 2);
+    In = DAG.getNode(ISD::CONCAT_VECTORS, DL, NVT, In,
+                     DAG.getConstant(0, DL, IntVT));
+    In = DAG.getVectorShuffle(NVT, DL, In, DAG.getUNDEF(NVT), Mask);
+    return DAG.getBitcast(VT, In);
+  }
+
   if (SVT.getVectorElementType() == MVT::f16) {
+    if (Subtarget.hasFP16() && isTypeLegal(SVT))
+      return Op;
     assert(Subtarget.hasF16C() && "Unexpected features!");
     if (SVT == MVT::v2f16)
       In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
@@ -22910,7 +22932,7 @@
   if (isFP) {
     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
-    if (isSoftFP16(EltVT, Subtarget))
+    if (isSoftF16(EltVT, Subtarget))
       return SDValue();
 
     bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
@@ -23475,7 +23497,7 @@
   ISD::CondCode CC =
       cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
 
-  if (isSoftFP16(Op0.getValueType()))
+  if (isSoftF16(Op0.getValueType(), Subtarget))
     return SDValue();
 
   // Handle f128 first, since one possible outcome is a normal integer
@@ -23668,7 +23690,7 @@
   MVT VT = Op1.getSimpleValueType();
   SDValue CC;
 
-  if (isSoftFP16(VT)) {
+  if (isSoftF16(VT, Subtarget)) {
     MVT NVT = VT.changeTypeToInteger();
     return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
                                           DAG.getBitcast(NVT, Op1),
@@ -23740,7 +23762,7 @@
   }
 
   if (Cond.getOpcode() == ISD::SETCC &&
-      !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
+      !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
       Cond = NewCond;
       // If the condition was updated, it's possible that the operands of the
@@ -24430,7 +24452,7 @@
   // Bail out when we don't have native compare instructions.
   if (Cond.getOpcode() == ISD::SETCC &&
       Cond.getOperand(0).getValueType() != MVT::f128 &&
-      !isSoftFP16(Cond.getOperand(0).getValueType())) {
+      !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
     SDValue LHS = Cond.getOperand(0);
     SDValue RHS = Cond.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -32231,7 +32253,7 @@
     EVT SrcVT = Src.getValueType();
     SDValue Res;
 
-    if (isSoftFP16(SrcVT)) {
+    if (isSoftF16(SrcVT, Subtarget)) {
      EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
      if (IsStrict) {
        Res =
@@ -44636,7 +44658,7 @@
   // ignored in unsafe-math mode).
   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
+      VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
      (Subtarget.hasSSE2() ||
       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
@@ -44953,7 +44975,7 @@
   }
 
   // Early exit check
-  if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
+  if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
     return SDValue();
 
   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
@@ -51712,7 +51734,7 @@
 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
-  if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
+  if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
     return SDValue();
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16 | FileCheck %s --check-prefixes=CHECK,BF16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,F16,BF16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,F16,FP16
 
 define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
 ; SSE2-LABEL: add:
@@ -20,22 +21,22 @@
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    movq %rdx, %rbx
-; BF16-NEXT:    movzwl (%rsi), %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    movzwl (%rdi), %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    movw %ax, (%rbx)
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    retq
+; F16-LABEL: add:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    movq %rdx, %rbx
+; F16-NEXT:    movzwl (%rsi), %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    movzwl (%rdi), %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    movw %ax, (%rbx)
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    retq
   %a = load bfloat, ptr %pa
   %b = load bfloat, ptr %pb
   %add = fadd bfloat %a, %b
@@ -58,19 +59,19 @@
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rax
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    vmovd %xmm1, %ecx
-; BF16-NEXT:    shll $16, %ecx
-; BF16-NEXT:    vmovd %ecx, %xmm0
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    popq %rax
-; BF16-NEXT:    retq
+; F16-LABEL: add2:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rax
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    vmovd %xmm1, %ecx
+; F16-NEXT:    shll $16, %ecx
+; F16-NEXT:    vmovd %ecx, %xmm0
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    popq %rax
+; F16-NEXT:    retq
   %add = fadd bfloat %a, %b
   ret bfloat %add
 }
@@ -105,34 +106,34 @@
 ; SSE2-NEXT:    popq %rbp
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_double:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbp
-; BF16-NEXT:    pushq %r14
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    movq %rdx, %rbx
-; BF16-NEXT:    movq %rsi, %r14
-; BF16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %ebp
-; BF16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    shll $16, %ebp
-; BF16-NEXT:    vmovd %ebp, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; BF16-NEXT:    vmovsd %xmm0, (%rbx)
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    popq %r14
-; BF16-NEXT:    popq %rbp
-; BF16-NEXT:    retq
+; F16-LABEL: add_double:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbp
+; F16-NEXT:    pushq %r14
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    movq %rdx, %rbx
+; F16-NEXT:    movq %rsi, %r14
+; F16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %ebp
+; F16-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    shll $16, %ebp
+; F16-NEXT:    vmovd %ebp, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; F16-NEXT:    vmovsd %xmm0, (%rbx)
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    popq %r14
+; F16-NEXT:    popq %rbp
+; F16-NEXT:    retq
   %la = load double, ptr %pa
   %a = fptrunc double %la to bfloat
   %lb = load double, ptr %pb
@@ -169,30 +170,30 @@
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_double2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    subq $16, %rsp
-; BF16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %ebx
-; BF16-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
-; BF16-NEXT:    # xmm0 = mem[0],zero
-; BF16-NEXT:    callq __truncdfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    shll $16, %ebx
-; BF16-NEXT:    vmovd %ebx, %xmm1
-; BF16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; BF16-NEXT:    addq $16, %rsp
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    retq
+; F16-LABEL: add_double2:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    subq $16, %rsp
+; F16-NEXT:    vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %ebx
+; F16-NEXT:    vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; F16-NEXT:    # xmm0 = mem[0],zero
+; F16-NEXT:    callq __truncdfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    shll $16, %ebx
+; F16-NEXT:    vmovd %ebx, %xmm1
+; F16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; F16-NEXT:    addq $16, %rsp
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    retq
   %a = fptrunc double %da to bfloat
   %b = fptrunc double %db to bfloat
   %add = fadd bfloat %a, %b
@@ -215,19 +216,19 @@
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_constant:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rbx
-; BF16-NEXT:    movq %rsi, %rbx
-; BF16-NEXT:    movzwl (%rdi), %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    movw %ax, (%rbx)
-; BF16-NEXT:    popq %rbx
-; BF16-NEXT:    retq
+; F16-LABEL: add_constant:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rbx
+; F16-NEXT:    movq %rsi, %rbx
+; F16-NEXT:    movzwl (%rdi), %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    movw %ax, (%rbx)
+; F16-NEXT:    popq %rbx
+; F16-NEXT:    retq
   %a = load bfloat, ptr %pa
   %add = fadd bfloat %a, 1.0
   store bfloat %add, ptr %pc
@@ -246,16 +247,16 @@
 ; SSE2-NEXT:    popq %rax
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: add_constant2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    pushq %rax
-; BF16-NEXT:    vmovd %xmm0, %eax
-; BF16-NEXT:    shll $16, %eax
-; BF16-NEXT:    vmovd %eax, %xmm0
-; BF16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; BF16-NEXT:    callq __truncsfbf2@PLT
-; BF16-NEXT:    popq %rax
-; BF16-NEXT:    retq
+; F16-LABEL: add_constant2:
+; F16:       # %bb.0:
+; F16-NEXT:    pushq %rax
+; F16-NEXT:    vmovd %xmm0, %eax
+; F16-NEXT:    shll $16, %eax
+; F16-NEXT:    vmovd %eax, %xmm0
+; F16-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; F16-NEXT:    callq __truncsfbf2@PLT
+; F16-NEXT:    popq %rax
+; F16-NEXT:    retq
   %add = fadd bfloat %a, 1.0
   ret bfloat %add
 }
@@ -540,6 +541,121 @@
 ; BF16-NEXT:    popq %r15
 ; BF16-NEXT:    popq %rbp
 ; BF16-NEXT:    retq
+;
+; FP16-LABEL: addv:
+; FP16:       # %bb.0:
+; FP16-NEXT:    pushq %rbp
+; FP16-NEXT:    pushq %r15
+; FP16-NEXT:    pushq %r14
+; FP16-NEXT:    pushq %r13
+; FP16-NEXT:    pushq %r12
+; FP16-NEXT:    pushq %rbx
+; FP16-NEXT:    subq $40, %rsp
+; FP16-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; FP16-NEXT:    vmovw %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm2
+; FP16-NEXT:    vmovw %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm2, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $7, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $7, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %ebp
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $6, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $6, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r14d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $5, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $5, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r15d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $4, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $4, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r12d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $3, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $3, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %r13d
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $2, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $2, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %ebx
+; FP16-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT:    vpextrw $1, %xmm0, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm0
+; FP16-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; FP16-NEXT:    vpextrw $1, %xmm1, %eax
+; FP16-NEXT:    shll $16, %eax
+; FP16-NEXT:    vmovd %eax, %xmm1
+; FP16-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; FP16-NEXT:    callq __truncsfbf2@PLT
+; FP16-NEXT:    vmovd %xmm0, %eax
+; FP16-NEXT:    vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; FP16-NEXT:    # xmm0 = mem[0],zero,zero,zero
+; FP16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $2, %ebx, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $3, %r13d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $4, %r12d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $5, %r15d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $6, %r14d, %xmm0, %xmm0
+; FP16-NEXT:    vpinsrw $7, %ebp, %xmm0, %xmm0
+; FP16-NEXT:    addq $40, %rsp
+; FP16-NEXT:    popq %rbx
+; FP16-NEXT:    popq %r12
+; FP16-NEXT:    popq %r13
+; FP16-NEXT:    popq %r14
+; FP16-NEXT:    popq %r15
+; FP16-NEXT:    popq %rbp
+; FP16-NEXT:    retq
   %add = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %add
 }
@@ -554,13 +670,13 @@
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr62997:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vmovd %xmm1, %eax
-; BF16-NEXT:    vmovd %xmm0, %ecx
-; BF16-NEXT:    vmovd %ecx, %xmm0
-; BF16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; BF16-NEXT:    retq
+; F16-LABEL: pr62997:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovd %xmm1, %eax
+; F16-NEXT:    vmovd %xmm0, %ecx
+; F16-NEXT:    vmovd %ecx, %xmm0
+; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; F16-NEXT:    retq
   %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
   %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
   ret <2 x bfloat> %2
@@ -575,10 +691,10 @@
 ; SSE2-NEXT:    xorps %xmm3, %xmm3
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr63017:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; BF16-NEXT:    retq
+; F16-LABEL: pr63017:
+; F16:       # %bb.0:
+; F16-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; F16-NEXT:    retq
   ret <32 x bfloat> zeroinitializer
 }
 
@@ -1149,11 +1265,11 @@
 ; SSE2-NEXT:    popq %r14
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr63017_2:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
-; BF16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
-; BF16-NEXT:    retq
+; F16-LABEL: pr63017_2:
+; F16:       # %bb.0:
+; F16-NEXT:    vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; F16-NEXT:    vmovdqu16 (%rax), %zmm0 {%k1}
+; F16-NEXT:    retq
   %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> )
   ret <32 x bfloat> %1
 }
@@ -1173,14 +1289,241 @@
 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
 ; SSE2-NEXT:    retq
 ;
-; BF16-LABEL: pr62997_3:
-; BF16:       # %bb.0:
-; BF16-NEXT:    vmovd %xmm1, %eax
-; BF16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
-; BF16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; BF16-NEXT:    retq
+; F16-LABEL: pr62997_3:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovd %xmm1, %eax
+; F16-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm1
+; F16-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; F16-NEXT:    retq
   %3 = insertelement <32 x bfloat> %0, bfloat %1, i64 1
   ret <32 x bfloat> %3
 }
 
 declare <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr, i32, <32 x i1>, <32 x bfloat>)
+
+define <4 x float> @pr64460_1(<4 x bfloat> %a) {
+; SSE2-LABEL: pr64460_1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $1, %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    pextrw $3, %xmm0, %eax
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_1:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovdqa %xmm0, %xmm0
+; F16-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [8,0,8,1,8,2,8,3,8,0,8,1,8,2,8,3]
+; F16-NEXT:    # ymm1 = mem[0,1,0,1]
+; F16-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; F16-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; F16-NEXT:    vzeroupper
+; F16-NEXT:    retq
+  %b = fpext <4 x bfloat> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+define <8 x float> @pr64460_2(<8 x bfloat> %a) {
+; SSE2-LABEL: pr64460_2:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    shrq $32, %rsi
+; SSE2-NEXT:    movl %edx, %edi
+; SSE2-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    movl %edx, %edi
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    shrq $48, %rdx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    andl $-65536, %edx # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    shrq $48, %rcx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_2:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovdqa %xmm0, %xmm0
+; F16-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,0,8,1,8,2,8,3,8,4,8,5,8,6,8,7]
+; F16-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; F16-NEXT:    retq
+  %b = fpext <8 x bfloat> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+define <16 x float> @pr64460_3(<16 x bfloat> %a) {
+; SSE2-LABEL: pr64460_3:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1,1]
+; SSE2-NEXT:    movq %xmm1, %rcx
+; SSE2-NEXT:    movq %rcx, %rax
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    movq %xmm0, %r9
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    movq %rsi, %rdx
+; SSE2-NEXT:    shrq $32, %rdx
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    shrq $32, %r8
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    shrq $32, %r10
+; SSE2-NEXT:    movl %r9d, %r11d
+; SSE2-NEXT:    andl $-65536, %r11d # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %r11d, %xmm1
+; SSE2-NEXT:    movl %r9d, %r11d
+; SSE2-NEXT:    shll $16, %r11d
+; SSE2-NEXT:    movd %r11d, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    shrq $48, %r9
+; SSE2-NEXT:    shll $16, %r9d
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    shll $16, %r10d
+; SSE2-NEXT:    movd %r10d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    movl %edi, %r9d
+; SSE2-NEXT:    andl $-65536, %r9d # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movl %edi, %r9d
+; SSE2-NEXT:    shll $16, %r9d
+; SSE2-NEXT:    movd %r9d, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT:    shrq $48, %rdi
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    shll $16, %r8d
+; SSE2-NEXT:    movd %r8d, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    movl %esi, %edi
+; SSE2-NEXT:    andl $-65536, %edi # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edi, %xmm3
+; SSE2-NEXT:    movl %esi, %edi
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    shrq $48, %rsi
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    movd %esi, %xmm3
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm4
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    andl $-65536, %edx # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %edx, %xmm4
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT:    shrq $48, %rcx
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm4
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_3:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovdqa %ymm0, %ymm0
+; F16-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7,16,8,16,9,16,10,16,11,16,12,16,13,16,14,16,15]
+; F16-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; F16-NEXT:    retq
+  %b = fpext <16 x bfloat> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+define <8 x double> @pr64460_4(<8 x bfloat> %a) {
+; SSE2-LABEL: pr64460_4:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %xmm0, %rsi
+; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    movq %rdx, %rax
+; SSE2-NEXT:    shrq $32, %rax
+; SSE2-NEXT:    movq %rdx, %rcx
+; SSE2-NEXT:    shrq $48, %rcx
+; SSE2-NEXT:    movq %rsi, %rdi
+; SSE2-NEXT:    shrq $32, %rdi
+; SSE2-NEXT:    movq %rsi, %r8
+; SSE2-NEXT:    shrq $48, %r8
+; SSE2-NEXT:    movl %esi, %r9d
+; SSE2-NEXT:    andl $-65536, %r9d # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %r9d, %xmm0
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm1
+; SSE2-NEXT:    shll $16, %esi
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    cvtss2sd %xmm0, %xmm0
+; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    shll $16, %r8d
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    cvtss2sd %xmm1, %xmm2
+; SSE2-NEXT:    shll $16, %edi
+; SSE2-NEXT:    movd %edi, %xmm1
+; SSE2-NEXT:    cvtss2sd %xmm1, %xmm1
+; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movl %edx, %esi
+; SSE2-NEXT:    andl $-65536, %esi # imm = 0xFFFF0000
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    cvtss2sd %xmm2, %xmm3
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    cvtss2sd %xmm2, %xmm2
+; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT:    shll $16, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    cvtss2sd %xmm3, %xmm4
+; SSE2-NEXT:    shll $16, %eax
+; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    cvtss2sd %xmm3, %xmm3
+; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT:    retq
+;
+; F16-LABEL: pr64460_4:
+; F16:       # %bb.0:
+; F16-NEXT:    vmovdqa %xmm0, %xmm0
+; F16-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,0,8,1,8,2,8,3,8,4,8,5,8,6,8,7]
+; F16-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; F16-NEXT:    vcvtps2pd %ymm0, %zmm0
+; F16-NEXT:    retq
+  %b = fpext <8 x bfloat> %a to <8 x double>
+  ret <8 x double> %b
+}
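
Note (not part of the patch): both the new bf16 branch in LowerFP_EXTEND and the scalar `shll $16` sequences in the SSE2 checks rely on the same bit-level fact: a bfloat16 value is the upper 16 bits of an IEEE-754 binary32, so extending bf16 to f32 only has to move the 16 payload bits into the high half of a 32-bit lane (the vector lowering does this by interleaving the bf16 words with zero words and bitcasting). A minimal standalone C++ sketch of that scalar identity; the function name and use of std::memcpy are illustrative and not taken from LLVM:

// Illustrative sketch, assuming only the bf16/f32 bit layout described above.
#include <cstdint>
#include <cstring>

static float ExtendBF16(uint16_t Bits) {
  uint32_t Wide = static_cast<uint32_t>(Bits) << 16; // bf16 bits -> high half of f32
  float F;
  std::memcpy(&F, &Wide, sizeof(F)); // bit-preserving reinterpretation, like a bitcast
  return F;
}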