diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2057,6 +2057,7 @@ setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); setTargetDAGCombine(ISD::FP16_TO_FP); + setTargetDAGCombine(ISD::FP_EXTEND); computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -29012,6 +29013,18 @@ N->dump(&DAG); #endif llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::CVTPH2PS: { + EVT VT = N->getValueType(0); + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo); + Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi); + SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + Results.push_back(Res); + return; + } case ISD::CTPOP: { assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); // Use a v2i64 if possible. @@ -43774,6 +43787,22 @@ return SDValue(); } +static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Src = N->getOperand(0); + + if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) { + APInt KnownUndef, KnownZero; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + APInt DemandedElts = APInt::getLowBitsSet(8, 4); + if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, + DCI)) + return SDValue(N, 0); + } + + return SDValue(); +} + // Try to combine sext_in_reg of a cmov of constants by extending the constants. static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); @@ -46640,6 +46669,57 @@ DAG.getIntPtrConstant(0, dl)); } +static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(0); + EVT SrcVT = Src.getValueType(); + + if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16) + return SDValue(); + + if (VT.getVectorElementType() != MVT::f32 && + VT.getVectorElementType() != MVT::f64) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) + return SDValue(); + + SDLoc dl(N); + + // Convert the input to vXi16. + EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); + Src = DAG.getBitcast(IntVT, Src); + + // Widen to at least 8 input elements. + if (NumElts < 8) { + unsigned NumConcats = 8 / NumElts; + SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT) + : DAG.getConstant(0, dl, IntVT); + SmallVector<SDValue, 4> Ops(NumConcats, Fill); + Ops[0] = Src; + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops); + } + + // Destination is vXf32 with at least 4 elements. + EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, + std::max(4U, NumElts)); + SDValue Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src); + + if (NumElts < 4) { + assert(NumElts == 2 && "Unexpected size"); + Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt, + DAG.getIntPtrConstant(0, dl)); + } + + // Extend to the original VT if necessary.
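+ // For f32 results Cvt already has type VT and this FP_EXTEND folds away; for f64 results it becomes a float-to-double conversion (cvtps2pd).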
+ return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -46707,6 +46787,7 @@ case X86ISD::CVTP2UI: case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI); + case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI); case X86ISD::BT: return combineBT(N, DAG, DCI); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); @@ -46792,6 +46873,7 @@ case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); + case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); } return SDValue(); diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -413,29 +413,7 @@ ; ; BWON-F16C-LABEL: test_extend32_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movl (%rdi), %eax -; BWON-F16C-NEXT: movl 4(%rdi), %ecx -; BWON-F16C-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; BWON-F16C-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; BWON-F16C-NEXT: vpextrw $1, %xmm1, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm2 -; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vmovd %xmm1, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm1 -; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; BWON-F16C-NEXT: vmovd %xmm0, %eax -; BWON-F16C-NEXT: movzwl %ax, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm2 -; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; BWON-F16C-NEXT: vpextrw $1, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; BWON-F16C-NEXT: vcvtph2ps (%rdi), %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_extend32_vec4: @@ -525,25 +503,8 @@ ; ; BWON-F16C-LABEL: test_extend64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: movzwl 2(%rdi), %ecx -; BWON-F16C-NEXT: movzwl 4(%rdi), %edx -; BWON-F16C-NEXT: movzwl 6(%rdi), %esi -; BWON-F16C-NEXT: vmovd %esi, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %edx, %xmm1 -; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; BWON-F16C-NEXT: vmovd %ecx, %xmm1 -; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; BWON-F16C-NEXT: vmovd %eax, %xmm2 -; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; BWON-F16C-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; BWON-F16C-NEXT: vcvtph2ps (%rdi), %xmm0 +; BWON-F16C-NEXT: vcvtps2pd %xmm0, %ymm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_extend64_vec4: diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -24,25 +24,7 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { ; ALL-LABEL: cvt_4i16_to_4f32: ; 
ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x float> @@ -52,25 +34,7 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -81,45 +45,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_8f32: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: vpextrq $1, %xmm0, %rdx -; ALL-NEXT: movq %rdx, %rsi -; ALL-NEXT: shrq $32, %rsi -; ALL-NEXT: movl %edx, %edi -; ALL-NEXT: shrl $16, %edi -; ALL-NEXT: vmovd %edi, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %dx, %edi -; ALL-NEXT: vmovd %edi, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; ALL-NEXT: movzwl %si, %esi -; ALL-NEXT: vmovd %esi, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtph2ps %xmm0, %ymm0 ; ALL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x float> @@ -129,252 +55,23 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; AVX1-LABEL: 
cvt_16i16_to_16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %r10 -; AVX1-NEXT: movq %r10, %r8 -; AVX1-NEXT: shrq $32, %r8 -; AVX1-NEXT: vpextrq $1, %xmm1, %rdx -; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: shrq $32, %r9 -; AVX1-NEXT: vmovq %xmm0, %rdi -; AVX1-NEXT: movq %rdi, %r11 -; AVX1-NEXT: shrq $32, %r11 -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: movq %rsi, %rax -; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: movl %esi, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movzwl %si, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX1-NEXT: shrq $48, %rsi -; AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movzwl %di, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: movzwl %r11w, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: shrq $48, %rdi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movzwl %dx, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: movzwl %r9w, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: shrq $48, %rdx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: movzwl %r10w, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: movzwl %r8w, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: shrq $48, %r10 -; AVX1-NEXT: vmovd %r10d, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtph2ps %xmm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_16i16_to_16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %r10 -; AVX2-NEXT: movq %r10, %r8 -; AVX2-NEXT: shrq $32, %r8 -; AVX2-NEXT: vpextrq $1, %xmm1, %rdx -; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: shrq $32, %r9 -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: movq %rdi, %r11 -; 
AVX2-NEXT: shrq $32, %r11 -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: movq %rsi, %rax -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movzwl %si, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX2-NEXT: shrq $48, %rsi -; AVX2-NEXT: vmovd %esi, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movzwl %di, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: movzwl %r11w, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: shrq $48, %rdi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movzwl %dx, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: movzwl %r9w, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: shrq $48, %rdx -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: movzwl %r10w, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: movzwl %r8w, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: shrq $48, %r10 -; AVX2-NEXT: vmovd %r10d, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtph2ps %xmm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_16i16_to_16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %r10 -; AVX512-NEXT: movq %r10, %r8 -; AVX512-NEXT: shrq $32, %r8 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: movq %rdx, %r9 -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rdi -; AVX512-NEXT: movq %rdi, %r11 -; AVX512-NEXT: shrq $32, %r11 -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: shrq $32, %rax -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %si, %ecx -; 
AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512-NEXT: shrq $48, %rsi -; AVX512-NEXT: vmovd %esi, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: movzwl %di, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: movzwl %r11w, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512-NEXT: shrq $48, %rdi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: movzwl %dx, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: movzwl %r9w, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512-NEXT: shrq $48, %rdx -; AVX512-NEXT: vmovd %edx, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: movzwl %r10w, %eax -; AVX512-NEXT: vmovd %eax, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512-NEXT: movzwl %r8w, %eax -; AVX512-NEXT: vmovd %eax, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512-NEXT: shrq $48, %r10 -; AVX512-NEXT: vmovd %r10d, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512-NEXT: retq %1 = bitcast <16 x i16> %a0 to <16 x half> %2 = fpext <16 x half> %1 to <16 x float> @@ -401,29 +98,7 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_4i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: movl 4(%rdi), %ecx -; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; ALL-NEXT: vpextrw $1, %xmm1, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; 
ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; ALL-NEXT: vpextrw $1, %xmm0, %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vcvtph2ps (%rdi), %xmm0 ; ALL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> @@ -434,25 +109,7 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_8i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; ALL-NEXT: vcvtph2ps (%rdi), %xmm0 ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -464,53 +121,7 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_8i16_to_8f32: ; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: movl 4(%rdi), %ecx -; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl 12(%rdi), %eax -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl 8(%rdi), %eax -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; ALL-NEXT: vpextrw $1, %xmm3, %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: vmovd %xmm3, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; ALL-NEXT: vpextrw $1, %xmm2, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; ALL-NEXT: vpextrw $1, %xmm1, %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3] -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; ALL-NEXT: vpextrw $1, %xmm0, %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; ALL-NEXT: vcvtph2ps (%rdi), %ymm0 ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> @@ -521,405 +132,20 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; 
AVX1-LABEL: load_cvt_16i16_to_16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rax -; AVX1-NEXT: movl 20(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 16(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 28(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 24(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl (%rdi), %eax -; AVX1-NEXT: movl 4(%rdi), %ecx -; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 12(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 8(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX1-NEXT: vpextrw $1, %xmm7, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm7, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX1-NEXT: vmovd %xmm6, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] -; AVX1-NEXT: vpextrw $1, %xmm6, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0] -; AVX1-NEXT: vpextrw $1, %xmm5, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vmovd %xmm5, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrw $1, %xmm4, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm4, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] -; AVX1-NEXT: vmovd %xmm3, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vpextrw $1, %xmm3, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX1-NEXT: vpextrw $1, %xmm2, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vmovd %xmm2, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX1-NEXT: vmovd %xmm8, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps 
%xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: vpextrw $1, %xmm8, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: popq %rax +; AVX1-NEXT: vcvtph2ps (%rdi), %ymm0 +; AVX1-NEXT: vcvtph2ps 16(%rdi), %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_cvt_16i16_to_16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: movl 20(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 16(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 28(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 24(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl (%rdi), %eax -; AVX2-NEXT: movl 4(%rdi), %ecx -; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 12(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 8(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX2-NEXT: vpextrw $1, %xmm7, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm7, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX2-NEXT: vmovd %xmm6, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] -; AVX2-NEXT: vpextrw $1, %xmm6, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0] -; AVX2-NEXT: vpextrw $1, %xmm5, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vmovd %xmm5, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrw $1, %xmm4, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm4, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] -; AVX2-NEXT: vmovd %xmm3, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: vpextrw $1, %xmm3, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; 
AVX2-NEXT: vpextrw $1, %xmm2, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vmovd %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX2-NEXT: vmovd %xmm8, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrw $1, %xmm8, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-NEXT: popq %rax +; AVX2-NEXT: vcvtph2ps (%rdi), %ymm0 +; AVX2-NEXT: vcvtph2ps 16(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_cvt_16i16_to_16f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rax -; AVX512F-NEXT: movl (%rdi), %eax -; AVX512F-NEXT: movl 4(%rdi), %ecx -; AVX512F-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 12(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 8(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 20(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 16(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 28(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 24(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX512F-NEXT: vpextrw $1, %xmm7, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm7, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3] -; AVX512F-NEXT: vmovd %xmm6, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3] -; AVX512F-NEXT: vpextrw $1, %xmm6, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0] -; AVX512F-NEXT: vpextrw $1, %xmm5, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vmovd %xmm5, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX512F-NEXT: vpextrw $1, %xmm4, %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 -; AVX512F-NEXT: vpextrw $1, %xmm3, %eax -; AVX512F-NEXT: vmovd 
%eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[2,3] -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512F-NEXT: vpextrw $1, %xmm2, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; AVX512F-NEXT: vpextrw $1, %xmm1, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vmovd %xmm8, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrw $1, %xmm8, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: popq %rax -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_16i16_to_16f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: pushq %rax -; AVX512VL-NEXT: movl (%rdi), %eax -; AVX512VL-NEXT: movl 4(%rdi), %ecx -; AVX512VL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 12(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 8(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 20(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 16(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 28(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 24(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX512VL-NEXT: vpextrw $1, %xmm7, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm7, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3] -; AVX512VL-NEXT: vmovd %xmm6, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3] -; AVX512VL-NEXT: vpextrw $1, %xmm6, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0] -; AVX512VL-NEXT: vpextrw $1, %xmm5, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: 
vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vmovd %xmm5, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX512VL-NEXT: vpextrw $1, %xmm4, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpextrw $1, %xmm3, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vmovd %xmm3, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3] -; AVX512VL-NEXT: vmovd %xmm8, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX512VL-NEXT: vpextrw $1, %xmm8, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512VL-NEXT: popq %rax -; AVX512VL-NEXT: retq +; AVX512-LABEL: load_cvt_16i16_to_16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtph2ps (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a0 %2 = bitcast <16 x i16> %1 to <16 x half> %3 = fpext <16 x half> %2 to <16 x float> @@ -946,16 +172,9 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { ; ALL-LABEL: cvt_2i16_to_2f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %ecx -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vcvtps2pd %xmm0, %xmm0 ; ALL-NEXT: retq %1 = bitcast <2 x i16> %a0 to <2 x half> %2 = fpext <2 x half> %1 to <2 x double> @@ -965,30 +184,8 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { ; ALL-LABEL: cvt_4i16_to_4f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movzwl %ax, %esi -; ALL-NEXT: # kill: def $eax 
killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovd %esi, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 ; ALL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x double> @@ -998,16 +195,9 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_2f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %ecx -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vcvtps2pd %xmm0, %xmm0 ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = bitcast <2 x i16> %1 to <2 x half> @@ -1018,30 +208,8 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_4f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movzwl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovd %esi, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -1052,159 +220,24 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX1-LABEL: cvt_8i16_to_8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: movq %rdx, %r10 -; AVX1-NEXT: movzwl %dx, %r8d -; AVX1-NEXT: # kill: def $edx killed $edx killed $rdx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: shrq $32, %r9 -; AVX1-NEXT: shrq $48, %r10 -; AVX1-NEXT: vmovq %xmm0, %rdi -; AVX1-NEXT: movq %rdi, %rsi -; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: movzwl %di, %ecx -; AVX1-NEXT: # kill: def $edi killed $edi killed $rdi -; AVX1-NEXT: shrl $16, %edi -; AVX1-NEXT: shrq $32, %rsi -; AVX1-NEXT: shrq $48, %rax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: 
movzwl %si, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovd %r10d, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: movzwl %r9w, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %r8d, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_8i16_to_8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: movq %rdx, %r10 -; AVX2-NEXT: movzwl %dx, %r8d -; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx -; AVX2-NEXT: shrl $16, %edx -; AVX2-NEXT: shrq $32, %r9 -; AVX2-NEXT: shrq $48, %r10 -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: movq %rdi, %rsi -; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: movzwl %di, %ecx -; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi -; AVX2-NEXT: shrl $16, %edi -; AVX2-NEXT: shrq $32, %rsi -; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: movzwl %si, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovd %r10d, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: movzwl %r9w, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %r8d, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_8i16_to_8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: movq %rdx, %r9 -; AVX512-NEXT: movq %rdx, %r10 -; AVX512-NEXT: movzwl 
%dx, %r8d -; AVX512-NEXT: # kill: def $edx killed $edx killed $rdx -; AVX512-NEXT: shrl $16, %edx -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: shrq $48, %r10 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdi -; AVX512-NEXT: movq %rdi, %rsi -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: # kill: def $edi killed $edi killed $rdi -; AVX512-NEXT: shrl $16, %edi -; AVX512-NEXT: shrq $32, %rsi -; AVX512-NEXT: shrq $48, %rax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovd %edi, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %ecx, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovd %r10d, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: movzwl %r9w, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovd %edx, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovd %r8d, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 ; AVX512-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x double> @@ -1232,15 +265,10 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_2i16_to_2f64: ; ALL: # %bb.0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: movzwl 2(%rdi), %ecx -; ALL-NEXT: vmovd %ecx, %xmm0 +; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vcvtps2pd %xmm0, %xmm0 ; ALL-NEXT: retq %1 = load <2 x i16>, <2 x i16>* %a0 %2 = bitcast <2 x i16> %1 to <2 x half> @@ -1251,25 +279,8 @@ define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_4i16_to_4f64: ; ALL: # %bb.0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: movzwl 2(%rdi), %ecx -; ALL-NEXT: movzwl 4(%rdi), %edx -; ALL-NEXT: movzwl 6(%rdi), %esi -; ALL-NEXT: vmovd %esi, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: 
vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtph2ps (%rdi), %xmm0 +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 ; ALL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> @@ -1280,30 +291,8 @@ define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_8i16_to_4f64: ; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movzwl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovd %esi, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtph2ps (%rdi), %xmm0 +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -1315,129 +304,24 @@ define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind { ; AVX1-LABEL: load_cvt_8i16_to_8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: movzwl 8(%rdi), %r8d -; AVX1-NEXT: movzwl 10(%rdi), %r9d -; AVX1-NEXT: movzwl 12(%rdi), %r10d -; AVX1-NEXT: movzwl 14(%rdi), %esi -; AVX1-NEXT: movzwl (%rdi), %eax -; AVX1-NEXT: movzwl 2(%rdi), %ecx -; AVX1-NEXT: movzwl 4(%rdi), %edx -; AVX1-NEXT: movzwl 6(%rdi), %edi -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %edx, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %r10d, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovd %r9d, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %r8d, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtph2ps (%rdi), %ymm1 +; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_cvt_8i16_to_8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: movzwl 8(%rdi), %r8d -; AVX2-NEXT: movzwl 10(%rdi), %r9d -; AVX2-NEXT: movzwl 12(%rdi), %r10d -; AVX2-NEXT: movzwl 14(%rdi), %esi -; AVX2-NEXT: movzwl (%rdi), %eax -; AVX2-NEXT: movzwl 
2(%rdi), %ecx -; AVX2-NEXT: movzwl 4(%rdi), %edx -; AVX2-NEXT: movzwl 6(%rdi), %edi -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %edx, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovd %esi, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %r10d, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vmovd %r9d, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %r8d, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtph2ps (%rdi), %ymm1 +; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: load_cvt_8i16_to_8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: movzwl (%rdi), %r8d -; AVX512-NEXT: movzwl 2(%rdi), %r9d -; AVX512-NEXT: movzwl 4(%rdi), %r10d -; AVX512-NEXT: movzwl 6(%rdi), %esi -; AVX512-NEXT: movzwl 8(%rdi), %eax -; AVX512-NEXT: movzwl 10(%rdi), %ecx -; AVX512-NEXT: movzwl 12(%rdi), %edx -; AVX512-NEXT: movzwl 14(%rdi), %edi -; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %edx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovd %esi, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovd %r10d, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vmovd %r9d, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vmovd %r8d, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vcvtph2ps (%rdi), %ymm0 +; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0 ; AVX512-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half>