diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2057,6 +2057,8 @@
   setTargetDAGCombine(ISD::MSCATTER);
   setTargetDAGCombine(ISD::MGATHER);
   setTargetDAGCombine(ISD::FP16_TO_FP);
+  setTargetDAGCombine(ISD::FP_EXTEND);
+  setTargetDAGCombine(ISD::FP_ROUND);
   computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -28830,6 +28832,20 @@
   return Tmp.first;
 }
+// Custom split CVTPS2PH with wide types.
+static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  SDValue Lo, Hi;
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+  SDValue RC = Op.getOperand(1);
+  Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
+  Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+}
+
 /// Provide custom lowering hooks for some operations.
 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -28965,8 +28981,8 @@
   case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
   case ISD::GC_TRANSITION_START:
   case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
-  case ISD::ADDRSPACECAST:
-    return LowerADDRSPACECAST(Op, DAG);
+  case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
+  case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
   }
 }
@@ -29012,6 +29028,18 @@
     N->dump(&DAG);
 #endif
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case X86ISD::CVTPH2PS: {
+    EVT VT = N->getValueType(0);
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+    EVT LoVT, HiVT;
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+    Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
+    Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
+    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+    Results.push_back(Res);
+    return;
+  }
   case ISD::CTPOP: {
     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
     // Use a v2i64 if possible.
@@ -35803,10 +35831,9 @@
   // TODO: Can we generalize this using computeKnownBits.
   if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
       (VT == MVT::v2f64 || VT == MVT::v2i64) &&
-      N->getOperand(0).getOpcode() == ISD::BITCAST &&
-      (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
-       N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
+      N->getOperand(0).getOpcode() == ISD::BITCAST) {
     SDValue In = N->getOperand(0).getOperand(0);
+    EVT InVT = In.getValueType();
     switch (In.getOpcode()) {
     default: break;
@@ -35817,8 +35844,9 @@
     case X86ISD::CVTSI2P:  case X86ISD::CVTUI2P:
     case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
     case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
-      if (In.getOperand(0).getValueType() == MVT::v2f64 ||
-          In.getOperand(0).getValueType() == MVT::v2i64)
+      if ((InVT == MVT::v4f32 || InVT == MVT::v4i32) &&
+          (In.getOperand(0).getValueType() == MVT::v2f64 ||
+           In.getOperand(0).getValueType() == MVT::v2i64))
         return N->getOperand(0); // return the bitcast
       break;
     case X86ISD::STRICT_CVTTP2SI:
@@ -35826,9 +35854,19 @@
     case X86ISD::STRICT_CVTSI2P: case X86ISD::STRICT_CVTUI2P:
     case X86ISD::STRICT_VFPROUND:
-      if (In.getOperand(1).getValueType() == MVT::v2f64 ||
-          In.getOperand(1).getValueType() == MVT::v2i64)
-        return N->getOperand(0);
+      if ((InVT == MVT::v4f32 || InVT == MVT::v4i32) &&
+          (In.getOperand(1).getValueType() == MVT::v2f64 ||
+           In.getOperand(1).getValueType() == MVT::v2i64))
+        return N->getOperand(0); // return the bitcast
+      break;
+    case X86ISD::CVTPS2PH:
+    case X86ISD::MCVTPS2PH:
+      if (InVT == MVT::v8i16 && In.getOperand(0).getValueType() == MVT::v4f32)
+        return N->getOperand(0); // return the bitcast
+      break;
+    case X86ISD::STRICT_CVTPS2PH:
+      if (InVT == MVT::v8i16 && In.getOperand(1).getValueType() == MVT::v4f32)
+        return N->getOperand(0); // return the bitcast
       break;
     }
   }
@@ -43774,6 +43812,22 @@
   return SDValue();
 }
+static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
+                               TargetLowering::DAGCombinerInfo &DCI) {
+  SDValue Src = N->getOperand(0);
+
+  if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
+    APInt KnownUndef, KnownZero;
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    APInt DemandedElts = APInt::getLowBitsSet(8, 4);
+    if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+                                       DCI))
+      return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
   assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
@@ -46640,6 +46694,97 @@
                       DAG.getIntPtrConstant(0, dl));
 }
+static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
+                                const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
+    return SDValue();
+
+  if (VT.getVectorElementType() != MVT::f32 &&
+      VT.getVectorElementType() != MVT::f64)
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 1 || !isPowerOf2_32(NumElts))
+    return SDValue();
+
+  SDLoc dl(N);
+
+  // Convert the input to vXi16.
+  EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+  Src = DAG.getBitcast(IntVT, Src);
+
+  // Widen to at least 8 input elements.
+  if (NumElts < 8) {
+    unsigned NumConcats = 8 / NumElts;
+    SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
+                                : DAG.getConstant(0, dl, IntVT);
+    SmallVector<SDValue, 4> Ops(NumConcats, Fill);
+    Ops[0] = Src;
+    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
+  }
+
+  // Destination is vXf32 with at least 4 elements.
+  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
+                               std::max(4U, NumElts));
+  SDValue Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
+
+  if (NumElts < 4) {
+    assert(NumElts == 2 && "Unexpected size");
+    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
+                      DAG.getIntPtrConstant(0, dl));
+  }
+
+  // Extend to the original VT if necessary.
+  return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
+}
+
+static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
+                               const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
+      SrcVT.getVectorElementType() != MVT::f32)
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 1 || !isPowerOf2_32(NumElts))
+    return SDValue();
+
+  SDLoc dl(N);
+
+  // Widen to at least 4 input elements.
+  if (NumElts < 4)
+    Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+                      DAG.getConstantFP(0.0, dl, SrcVT));
+
+  // Destination is v8i16 with at least 8 elements.
+  EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+                               std::max(8U, NumElts));
+  SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
+                            DAG.getTargetConstant(4, dl, MVT::i32));
+
+  // Extract down to real number of elements.
+  if (NumElts < 8) {
+    EVT IntVT = VT.changeVectorElementTypeToInteger();
+    Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
+                      DAG.getIntPtrConstant(0, dl));
+  }
+
+  return DAG.getBitcast(VT, Cvt);
+}
+
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -46707,6 +46852,7 @@
   case X86ISD::CVTP2UI:
   case X86ISD::CVTTP2SI:
   case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+  case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
   case X86ISD::BT: return combineBT(N, DAG, DCI);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
@@ -46792,6 +46938,8 @@
   case X86ISD::KSHIFTL:
   case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
   case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
+  case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+  case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
   }
   return SDValue();
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -413,29 +413,7 @@
 ;
 ; BWON-F16C-LABEL: test_extend32_vec4:
 ; BWON-F16C: # %bb.0:
-; BWON-F16C-NEXT: movl (%rdi), %eax
-; BWON-F16C-NEXT: movl 4(%rdi), %ecx
-; BWON-F16C-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
-; BWON-F16C-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
-; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0
-; BWON-F16C-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
-; BWON-F16C-NEXT: vpextrw $1, %xmm1, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm2
-; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2
-; BWON-F16C-NEXT: vmovd %xmm1, %eax
-; BWON-F16C-NEXT: movzwl %ax, %eax
-; BWON-F16C-NEXT: vmovd %eax, %xmm1
-; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; BWON-F16C-NEXT: vmovd %xmm0, %eax
-; BWON-F16C-NEXT: movzwl %ax, 
%eax -; BWON-F16C-NEXT: vmovd %eax, %xmm2 -; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; BWON-F16C-NEXT: vpextrw $1, %xmm0, %eax -; BWON-F16C-NEXT: vmovd %eax, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; BWON-F16C-NEXT: vcvtph2ps (%rdi), %xmm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_extend32_vec4: @@ -525,25 +503,8 @@ ; ; BWON-F16C-LABEL: test_extend64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: movzwl (%rdi), %eax -; BWON-F16C-NEXT: movzwl 2(%rdi), %ecx -; BWON-F16C-NEXT: movzwl 4(%rdi), %edx -; BWON-F16C-NEXT: movzwl 6(%rdi), %esi -; BWON-F16C-NEXT: vmovd %esi, %xmm0 -; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %edx, %xmm1 -; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; BWON-F16C-NEXT: vmovd %ecx, %xmm1 -; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; BWON-F16C-NEXT: vmovd %eax, %xmm2 -; BWON-F16C-NEXT: vcvtph2ps %xmm2, %xmm2 -; BWON-F16C-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; BWON-F16C-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; BWON-F16C-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; BWON-F16C-NEXT: vcvtph2ps (%rdi), %xmm0 +; BWON-F16C-NEXT: vcvtps2pd %xmm0, %ymm0 ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_extend64_vec4: @@ -656,17 +617,7 @@ ; ; BWON-F16C-LABEL: test_trunc32_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; BWON-F16C-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; BWON-F16C-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rdi) -; BWON-F16C-NEXT: vpextrw $0, %xmm3, 6(%rdi) -; BWON-F16C-NEXT: vpextrw $0, %xmm2, 4(%rdi) -; BWON-F16C-NEXT: vpextrw $0, %xmm1, 2(%rdi) +; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, (%rdi) ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc32_vec4: diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -24,25 +24,7 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind { ; ALL-LABEL: cvt_4i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x float> @@ -52,25 +34,7 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: 
vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -81,45 +45,7 @@ define <8 x float> @cvt_8i16_to_8f32(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_8f32: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: vpextrq $1, %xmm0, %rdx -; ALL-NEXT: movq %rdx, %rsi -; ALL-NEXT: shrq $32, %rsi -; ALL-NEXT: movl %edx, %edi -; ALL-NEXT: shrl $16, %edi -; ALL-NEXT: vmovd %edi, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %dx, %edi -; ALL-NEXT: vmovd %edi, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; ALL-NEXT: movzwl %si, %esi -; ALL-NEXT: vmovd %esi, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtph2ps %xmm0, %ymm0 ; ALL-NEXT: retq %1 = bitcast <8 x i16> %a0 to <8 x half> %2 = fpext <8 x half> %1 to <8 x float> @@ -129,252 +55,23 @@ define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { ; AVX1-LABEL: cvt_16i16_to_16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %r10 -; AVX1-NEXT: movq %r10, %r8 -; AVX1-NEXT: shrq $32, %r8 -; AVX1-NEXT: vpextrq $1, %xmm1, %rdx -; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: shrq $32, %r9 -; AVX1-NEXT: vmovq %xmm0, %rdi -; AVX1-NEXT: movq %rdi, %r11 -; AVX1-NEXT: shrq $32, %r11 -; AVX1-NEXT: vpextrq $1, %xmm0, %rsi -; AVX1-NEXT: movq %rsi, %rax -; AVX1-NEXT: shrq $32, %rax -; AVX1-NEXT: movl %esi, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: movzwl %si, %ecx -; AVX1-NEXT: vmovd %ecx, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX1-NEXT: shrq $48, %rsi -; 
AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movzwl %di, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: movzwl %r11w, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: shrq $48, %rdi -; AVX1-NEXT: vmovd %edi, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: movzwl %dx, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: movzwl %r9w, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: shrq $48, %rdx -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: movl %r10d, %eax -; AVX1-NEXT: shrl $16, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: movzwl %r10w, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: movzwl %r8w, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: shrq $48, %r10 -; AVX1-NEXT: vmovd %r10d, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtph2ps %xmm0, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_16i16_to_16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %r10 -; AVX2-NEXT: movq %r10, %r8 -; AVX2-NEXT: shrq $32, %r8 -; AVX2-NEXT: vpextrq $1, %xmm1, %rdx -; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: shrq $32, %r9 -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: movq %rdi, %r11 -; AVX2-NEXT: shrq $32, %r11 -; AVX2-NEXT: vpextrq $1, %xmm0, %rsi -; AVX2-NEXT: movq %rsi, %rax -; AVX2-NEXT: shrq $32, %rax -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: shrl $16, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: movzwl %si, %ecx -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX2-NEXT: shrq $48, %rsi -; AVX2-NEXT: vmovd %esi, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX2-NEXT: movl %edi, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movzwl %di, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: 
vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: movzwl %r11w, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: shrq $48, %rdi -; AVX2-NEXT: vmovd %edi, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: movl %edx, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: movzwl %dx, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: movzwl %r9w, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: shrq $48, %rdx -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: movl %r10d, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: movzwl %r10w, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: movzwl %r8w, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: shrq $48, %r10 -; AVX2-NEXT: vmovd %r10d, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtph2ps %xmm0, %ymm2 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX2-NEXT: vmovaps %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_16i16_to_16f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %r10 -; AVX512-NEXT: movq %r10, %r8 -; AVX512-NEXT: shrq $32, %r8 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512-NEXT: movq %rdx, %r9 -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rdi -; AVX512-NEXT: movq %rdi, %r11 -; AVX512-NEXT: shrq $32, %r11 -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512-NEXT: movq %rsi, %rax -; AVX512-NEXT: shrq $32, %rax -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: movzwl %si, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512-NEXT: movzwl %ax, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512-NEXT: shrq $48, %rsi -; AVX512-NEXT: vmovd %esi, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512-NEXT: movl %edi, %eax -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: movzwl %di, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: movzwl %r11w, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[0],xmm1[3] -; AVX512-NEXT: shrq $48, %rdi -; AVX512-NEXT: vmovd %edi, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: movl %edx, %eax -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: movzwl %dx, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512-NEXT: movzwl %r9w, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512-NEXT: shrq $48, %rdx -; AVX512-NEXT: vmovd %edx, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512-NEXT: movl %r10d, %eax -; AVX512-NEXT: shrl $16, %eax -; AVX512-NEXT: vmovd %eax, %xmm2 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: movzwl %r10w, %eax -; AVX512-NEXT: vmovd %eax, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512-NEXT: movzwl %r8w, %eax -; AVX512-NEXT: vmovd %eax, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512-NEXT: shrq $48, %r10 -; AVX512-NEXT: vmovd %r10d, %xmm3 -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vcvtph2ps %ymm0, %zmm0 ; AVX512-NEXT: retq %1 = bitcast <16 x i16> %a0 to <16 x half> %2 = fpext <16 x half> %1 to <16 x float> @@ -401,29 +98,7 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_4i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: movl 4(%rdi), %ecx -; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; ALL-NEXT: vpextrw $1, %xmm1, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; ALL-NEXT: vpextrw $1, %xmm0, %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; ALL-NEXT: vcvtph2ps (%rdi), %xmm0 ; ALL-NEXT: retq %1 = load <4 x i16>, <4 x i16>* %a0 %2 = bitcast <4 x i16> %1 to <4 x half> @@ -434,25 +109,7 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_8i16_to_4f32: ; ALL: # %bb.0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: movl %eax, %edx -; ALL-NEXT: shrl $16, %edx -; ALL-NEXT: vmovd %edx, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: movzwl %ax, %edx -; ALL-NEXT: vmovd %edx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; 
ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; ALL-NEXT: shrq $48, %rax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; ALL-NEXT: vcvtph2ps (%rdi), %xmm0 ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> @@ -464,53 +121,7 @@ define <8 x float> @load_cvt_8i16_to_8f32(<8 x i16>* %a0) nounwind { ; ALL-LABEL: load_cvt_8i16_to_8f32: ; ALL: # %bb.0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: movl 4(%rdi), %ecx -; ALL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl 12(%rdi), %eax -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: movl 8(%rdi), %eax -; ALL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; ALL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; ALL-NEXT: vpextrw $1, %xmm3, %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: vmovd %xmm3, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; ALL-NEXT: vmovd %xmm2, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm4 -; ALL-NEXT: vcvtph2ps %xmm4, %xmm4 -; ALL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; ALL-NEXT: vpextrw $1, %xmm2, %eax -; ALL-NEXT: vmovd %eax, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; ALL-NEXT: vpextrw $1, %xmm1, %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vmovd %xmm1, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3] -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %eax -; ALL-NEXT: vmovd %eax, %xmm3 -; ALL-NEXT: vcvtph2ps %xmm3, %xmm3 -; ALL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; ALL-NEXT: vpextrw $1, %xmm0, %eax -; ALL-NEXT: vmovd %eax, %xmm0 -; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; ALL-NEXT: vcvtph2ps (%rdi), %ymm0 ; ALL-NEXT: retq %1 = load <8 x i16>, <8 x i16>* %a0 %2 = bitcast <8 x i16> %1 to <8 x half> @@ -521,405 +132,20 @@ define <16 x float> @load_cvt_16i16_to_16f32(<16 x i16>* %a0) nounwind { ; AVX1-LABEL: load_cvt_16i16_to_16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: pushq %rax -; AVX1-NEXT: movl 20(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 16(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 28(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 24(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl (%rdi), %eax -; AVX1-NEXT: movl 4(%rdi), %ecx -; AVX1-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 12(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movl 8(%rdi), %eax -; AVX1-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; AVX1-NEXT: vmovdqa 
-{{[0-9]+}}(%rsp), %xmm5 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX1-NEXT: vpextrw $1, %xmm7, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm7, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX1-NEXT: vmovd %xmm6, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm7 -; AVX1-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] -; AVX1-NEXT: vpextrw $1, %xmm6, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0] -; AVX1-NEXT: vpextrw $1, %xmm5, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vmovd %xmm5, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm5 -; AVX1-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm6 -; AVX1-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX1-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpextrw $1, %xmm4, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vmovd %xmm4, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] -; AVX1-NEXT: vmovd %xmm3, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm4 -; AVX1-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vpextrw $1, %xmm3, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX1-NEXT: vpextrw $1, %xmm2, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vmovd %xmm2, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX1-NEXT: vmovd %xmm8, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: vpextrw $1, %xmm8, %eax -; AVX1-NEXT: vmovd %eax, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: popq %rax +; AVX1-NEXT: vcvtph2ps (%rdi), %ymm0 +; AVX1-NEXT: vcvtph2ps 16(%rdi), %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_cvt_16i16_to_16f32: ; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: movl 20(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 16(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 28(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 24(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl (%rdi), %eax -; AVX2-NEXT: movl 4(%rdi), %ecx -; AVX2-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %eax, 
-{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 12(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl 8(%rdi), %eax -; AVX2-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm0 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX2-NEXT: vpextrw $1, %xmm7, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm7, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[2,3] -; AVX2-NEXT: vmovd %xmm6, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm7 -; AVX2-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] -; AVX2-NEXT: vpextrw $1, %xmm6, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[0] -; AVX2-NEXT: vpextrw $1, %xmm5, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vmovd %xmm5, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm5 -; AVX2-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm6 -; AVX2-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpextrw $1, %xmm4, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vmovd %xmm4, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] -; AVX2-NEXT: vmovd %xmm3, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm4 -; AVX2-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: vpextrw $1, %xmm3, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX2-NEXT: vpextrw $1, %xmm2, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vmovd %xmm2, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; AVX2-NEXT: vmovd %xmm8, %eax -; AVX2-NEXT: movzwl %ax, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrw $1, %xmm8, %eax -; AVX2-NEXT: vmovd %eax, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-NEXT: popq %rax +; AVX2-NEXT: vcvtph2ps (%rdi), %ymm0 +; AVX2-NEXT: vcvtph2ps 16(%rdi), %ymm1 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_cvt_16i16_to_16f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rax -; 
AVX512F-NEXT: movl (%rdi), %eax -; AVX512F-NEXT: movl 4(%rdi), %ecx -; AVX512F-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 12(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 8(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 20(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 16(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 28(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: movl 24(%rdi), %eax -; AVX512F-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX512F-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX512F-NEXT: vpextrw $1, %xmm7, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm7, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3] -; AVX512F-NEXT: vmovd %xmm6, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm7 -; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3] -; AVX512F-NEXT: vpextrw $1, %xmm6, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0] -; AVX512F-NEXT: vpextrw $1, %xmm5, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vmovd %xmm5, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm5 -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX512F-NEXT: vmovd %xmm4, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm6 -; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX512F-NEXT: vpextrw $1, %xmm4, %eax -; AVX512F-NEXT: vmovd %eax, %xmm4 -; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 -; AVX512F-NEXT: vpextrw $1, %xmm3, %eax -; AVX512F-NEXT: vmovd %eax, %xmm0 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm3, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[2,3] -; AVX512F-NEXT: vmovd %xmm2, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512F-NEXT: vpextrw $1, %xmm2, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] -; AVX512F-NEXT: vpextrw $1, %xmm1, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vmovd %xmm1, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm1 -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512F-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vmovd %xmm8, %eax -; AVX512F-NEXT: movzwl %ax, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrw $1, %xmm8, %eax -; AVX512F-NEXT: vmovd %eax, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512F-NEXT: popq %rax -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: load_cvt_16i16_to_16f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: pushq %rax -; AVX512VL-NEXT: movl (%rdi), %eax -; AVX512VL-NEXT: movl 4(%rdi), %ecx -; AVX512VL-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 12(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 8(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 20(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 16(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 28(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: movl 24(%rdi), %eax -; AVX512VL-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm8 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm3 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm4 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm5 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm6 -; AVX512VL-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm7 -; AVX512VL-NEXT: vpextrw $1, %xmm7, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm0 -; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm7, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[2,3] -; AVX512VL-NEXT: vmovd %xmm6, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm7 -; AVX512VL-NEXT: vcvtph2ps %xmm7, %xmm7 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0],xmm0[3] -; AVX512VL-NEXT: vpextrw $1, %xmm6, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0] -; AVX512VL-NEXT: vpextrw $1, %xmm5, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vmovd %xmm5, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm5 -; AVX512VL-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] -; AVX512VL-NEXT: vmovd %xmm4, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm6 -; AVX512VL-NEXT: vcvtph2ps %xmm6, %xmm6 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0],xmm5[3] -; AVX512VL-NEXT: vpextrw $1, %xmm4, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 -; AVX512VL-NEXT: vpextrw $1, %xmm3, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vmovd %xmm3, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; 
AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[2,3] -; AVX512VL-NEXT: vmovd %xmm2, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm4 -; AVX512VL-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; AVX512VL-NEXT: vpextrw $1, %xmm2, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm2 -; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] -; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vmovd %xmm1, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm1 -; AVX512VL-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2,3] -; AVX512VL-NEXT: vmovd %xmm8, %eax -; AVX512VL-NEXT: movzwl %ax, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX512VL-NEXT: vpextrw $1, %xmm8, %eax -; AVX512VL-NEXT: vmovd %eax, %xmm3 -; AVX512VL-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512VL-NEXT: popq %rax -; AVX512VL-NEXT: retq +; AVX512-LABEL: load_cvt_16i16_to_16f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtph2ps (%rdi), %zmm0 +; AVX512-NEXT: retq %1 = load <16 x i16>, <16 x i16>* %a0 %2 = bitcast <16 x i16> %1 to <16 x half> %3 = fpext <16 x half> %2 to <16 x float> @@ -946,16 +172,9 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind { ; ALL-LABEL: cvt_2i16_to_2f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %ecx -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vcvtps2pd %xmm0, %xmm0 ; ALL-NEXT: retq %1 = bitcast <2 x i16> %a0 to <2 x half> %2 = fpext <2 x half> %1 to <2 x double> @@ -965,30 +184,8 @@ define <4 x double> @cvt_4i16_to_4f64(<4 x i16> %a0) nounwind { ; ALL-LABEL: cvt_4i16_to_4f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movzwl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovd %esi, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 ; ALL-NEXT: retq %1 = bitcast <4 x i16> %a0 to <4 x half> %2 = fpext <4 x half> %1 to <4 x double> @@ -998,16 +195,9 @@ define <2 x double> 
@cvt_8i16_to_2f64(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_2f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movzwl %ax, %ecx -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: vmovd %eax, %xmm0 +; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; ALL-NEXT: vcvtps2pd %xmm0, %xmm0 ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> %2 = bitcast <2 x i16> %1 to <2 x half> @@ -1018,30 +208,8 @@ define <4 x double> @cvt_8i16_to_4f64(<8 x i16> %a0) nounwind { ; ALL-LABEL: cvt_8i16_to_4f64: ; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: movq %rax, %rcx -; ALL-NEXT: movq %rax, %rdx -; ALL-NEXT: movzwl %ax, %esi -; ALL-NEXT: # kill: def $eax killed $eax killed $rax -; ALL-NEXT: shrl $16, %eax -; ALL-NEXT: shrq $32, %rcx -; ALL-NEXT: shrq $48, %rdx -; ALL-NEXT: vmovd %edx, %xmm0 ; ALL-NEXT: vcvtph2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: movzwl %cx, %ecx -; ALL-NEXT: vmovd %ecx, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; ALL-NEXT: vmovd %eax, %xmm1 -; ALL-NEXT: vcvtph2ps %xmm1, %xmm1 -; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vmovd %esi, %xmm2 -; ALL-NEXT: vcvtph2ps %xmm2, %xmm2 -; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; ALL-NEXT: vcvtps2pd %xmm0, %ymm0 ; ALL-NEXT: retq %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> %2 = bitcast <4 x i16> %1 to <4 x half> @@ -1052,159 +220,24 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { ; AVX1-LABEL: cvt_8i16_to_8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rdx -; AVX1-NEXT: movq %rdx, %r9 -; AVX1-NEXT: movq %rdx, %r10 -; AVX1-NEXT: movzwl %dx, %r8d -; AVX1-NEXT: # kill: def $edx killed $edx killed $rdx -; AVX1-NEXT: shrl $16, %edx -; AVX1-NEXT: shrq $32, %r9 -; AVX1-NEXT: shrq $48, %r10 -; AVX1-NEXT: vmovq %xmm0, %rdi -; AVX1-NEXT: movq %rdi, %rsi -; AVX1-NEXT: movq %rdi, %rax -; AVX1-NEXT: movzwl %di, %ecx -; AVX1-NEXT: # kill: def $edi killed $edi killed $rdi -; AVX1-NEXT: shrl $16, %edi -; AVX1-NEXT: shrq $32, %rsi -; AVX1-NEXT: shrq $48, %rax -; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: movzwl %si, %eax -; AVX1-NEXT: vmovd %eax, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vmovd %edi, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vmovd %ecx, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovd %r10d, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: movzwl %r9w, %eax -; AVX1-NEXT: vmovd %eax, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vmovd %edx, %xmm2 -; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX1-NEXT: 
vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vmovd %r8d, %xmm3 -; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: cvt_8i16_to_8f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpextrq $1, %xmm0, %rdx -; AVX2-NEXT: movq %rdx, %r9 -; AVX2-NEXT: movq %rdx, %r10 -; AVX2-NEXT: movzwl %dx, %r8d -; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx -; AVX2-NEXT: shrl $16, %edx -; AVX2-NEXT: shrq $32, %r9 -; AVX2-NEXT: shrq $48, %r10 -; AVX2-NEXT: vmovq %xmm0, %rdi -; AVX2-NEXT: movq %rdi, %rsi -; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: movzwl %di, %ecx -; AVX2-NEXT: # kill: def $edi killed $edi killed $rdi -; AVX2-NEXT: shrl $16, %edi -; AVX2-NEXT: shrq $32, %rsi -; AVX2-NEXT: shrq $48, %rax -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: movzwl %si, %eax -; AVX2-NEXT: vmovd %eax, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vmovd %edi, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovd %ecx, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovd %r10d, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: movzwl %r9w, %eax -; AVX2-NEXT: vmovd %eax, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vmovd %edx, %xmm2 -; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vmovd %r8d, %xmm3 -; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: cvt_8i16_to_8f64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rdx -; AVX512-NEXT: movq %rdx, %r9 -; AVX512-NEXT: movq %rdx, %r10 -; AVX512-NEXT: movzwl %dx, %r8d -; AVX512-NEXT: # kill: def $edx killed $edx killed $rdx -; AVX512-NEXT: shrl $16, %edx -; AVX512-NEXT: shrq $32, %r9 -; AVX512-NEXT: shrq $48, %r10 -; AVX512-NEXT: vpextrq $1, %xmm0, %rdi -; AVX512-NEXT: movq %rdi, %rsi -; AVX512-NEXT: movq %rdi, %rax -; AVX512-NEXT: movzwl %di, %ecx -; AVX512-NEXT: # kill: def $edi killed $edi killed $rdi -; AVX512-NEXT: shrl $16, %edi -; AVX512-NEXT: shrq $32, %rsi -; AVX512-NEXT: shrq $48, %rax -; AVX512-NEXT: vmovd %eax, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: movzwl %si, %eax -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vmovd %edi, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: 
vmovd %ecx, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovd %r10d, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: movzwl %r9w, %eax
-; AVX512-NEXT: vmovd %eax, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovd %edx, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %r8d, %xmm3
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = bitcast <8 x i16> %a0 to <8 x half>
%2 = fpext <8 x half> %1 to <8 x double>
@@ -1232,15 +265,10 @@
define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_2i16_to_2f64:
; ALL: # %bb.0:
-; ALL-NEXT: movzwl (%rdi), %eax
-; ALL-NEXT: movzwl 2(%rdi), %ecx
-; ALL-NEXT: vmovd %ecx, %xmm0
+; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovd %eax, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vcvtps2pd %xmm0, %xmm0
; ALL-NEXT: retq
%1 = load <2 x i16>, <2 x i16>* %a0
%2 = bitcast <2 x i16> %1 to <2 x half>
@@ -1251,25 +279,8 @@
define <4 x double> @load_cvt_4i16_to_4f64(<4 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_4i16_to_4f64:
; ALL: # %bb.0:
-; ALL-NEXT: movzwl (%rdi), %eax
-; ALL-NEXT: movzwl 2(%rdi), %ecx
-; ALL-NEXT: movzwl 4(%rdi), %edx
-; ALL-NEXT: movzwl 6(%rdi), %esi
-; ALL-NEXT: vmovd %esi, %xmm0
-; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovd %edx, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vmovd %ecx, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovd %eax, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
+; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = load <4 x i16>, <4 x i16>* %a0
%2 = bitcast <4 x i16> %1 to <4 x half>
@@ -1280,30 +291,8 @@
define <4 x double> @load_cvt_8i16_to_4f64(<8 x i16>* %a0) nounwind {
; ALL-LABEL: load_cvt_8i16_to_4f64:
; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: movq %rax, %rcx
-; ALL-NEXT: movq %rax, %rdx
-; ALL-NEXT: movzwl %ax, %esi
-; ALL-NEXT: # kill: def $eax killed $eax killed $rax
-; ALL-NEXT: shrl $16, %eax
-; ALL-NEXT: shrq $32, %rcx
-; ALL-NEXT: shrq $48, %rdx
-; ALL-NEXT: vmovd %edx, %xmm0
-; ALL-NEXT: vcvtph2ps %xmm0, %xmm0
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: movzwl %cx, %ecx
-; ALL-NEXT: vmovd %ecx, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vmovd %eax, %xmm1
-; ALL-NEXT: vcvtph2ps %xmm1, %xmm1
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vmovd %esi, %xmm2
-; ALL-NEXT: vcvtph2ps %xmm2, %xmm2
-; ALL-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vcvtph2ps (%rdi), %xmm0
+; ALL-NEXT: vcvtps2pd %xmm0, %ymm0
; ALL-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32>
@@ -1315,129 +304,24 @@
define <8 x double> @load_cvt_8i16_to_8f64(<8 x i16>* %a0) nounwind {
; AVX1-LABEL: load_cvt_8i16_to_8f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: movzwl 8(%rdi), %r8d
-; AVX1-NEXT: movzwl 10(%rdi), %r9d
-; AVX1-NEXT: movzwl 12(%rdi), %r10d
-; AVX1-NEXT: movzwl 14(%rdi), %esi
-; AVX1-NEXT: movzwl (%rdi), %eax
-; AVX1-NEXT: movzwl 2(%rdi), %ecx
-; AVX1-NEXT: movzwl 4(%rdi), %edx
-; AVX1-NEXT: movzwl 6(%rdi), %edi
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %edx, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vmovd %esi, %xmm1
-; AVX1-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %r10d, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vmovd %r9d, %xmm2
-; AVX1-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX1-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %r8d, %xmm3
-; AVX1-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX1-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtph2ps (%rdi), %ymm1
+; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_cvt_8i16_to_8f64:
; AVX2: # %bb.0:
-; AVX2-NEXT: movzwl 8(%rdi), %r8d
-; AVX2-NEXT: movzwl 10(%rdi), %r9d
-; AVX2-NEXT: movzwl 12(%rdi), %r10d
-; AVX2-NEXT: movzwl 14(%rdi), %esi
-; AVX2-NEXT: movzwl (%rdi), %eax
-; AVX2-NEXT: movzwl 2(%rdi), %ecx
-; AVX2-NEXT: movzwl 4(%rdi), %edx
-; AVX2-NEXT: movzwl 6(%rdi), %edi
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %edx, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovd %esi, %xmm1
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %r10d, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vmovd %r9d, %xmm2
-; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX2-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %r8d, %xmm3
-; AVX2-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX2-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vcvtph2ps (%rdi), %ymm1
+; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_cvt_8i16_to_8f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: movzwl (%rdi), %r8d
-; AVX512-NEXT: movzwl 2(%rdi), %r9d
-; AVX512-NEXT: movzwl 4(%rdi), %r10d
-; AVX512-NEXT: movzwl 6(%rdi), %esi
-; AVX512-NEXT: movzwl 8(%rdi), %eax
-; AVX512-NEXT: movzwl 10(%rdi), %ecx
-; AVX512-NEXT: movzwl 12(%rdi), %edx
-; AVX512-NEXT: movzwl 14(%rdi), %edi
-; AVX512-NEXT: vmovd %edi, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %edx, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovd %ecx, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovd %eax, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovd %esi, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovd %r10d, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovd %r9d, %xmm2
-; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2
-; AVX512-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %r8d, %xmm3
-; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3
-; AVX512-NEXT: vcvtss2sd %xmm3, %xmm3, %xmm3
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vcvtph2ps (%rdi), %ymm0
+; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0
; AVX512-NEXT: retq
%1 = load <8 x i16>, <8 x i16>* %a0
%2 = bitcast <8 x i16> %1 to <8 x half>
@@ -1464,18 +348,7 @@
define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
; ALL-LABEL: cvt_4f32_to_4i16:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1485,18 +358,7 @@
define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
; ALL-LABEL: cvt_4f32_to_8i16_undef:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1507,18 +369,7 @@
define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
; ALL-LABEL: cvt_4f32_to_8i16_zero:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1529,44 +380,7 @@
define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
; ALL-LABEL: cvt_8f32_to_8i16:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, %eax
-; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vmovd %xmm1, %ecx
-; ALL-NEXT: shll $16, %ecx
-; ALL-NEXT: orl %eax, %ecx
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, %edx
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vmovd %xmm1, %eax
-; ALL-NEXT: shll $16, %eax
-; ALL-NEXT: orl %edx, %eax
-; ALL-NEXT: shlq $32, %rax
-; ALL-NEXT: orq %rcx, %rax
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, %ecx
-; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vmovd %xmm1, %edx
-; ALL-NEXT: shll $16, %edx
-; ALL-NEXT: orl %ecx, %edx
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, %ecx
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vmovd %xmm0, %esi
-; ALL-NEXT: shll $16, %esi
-; ALL-NEXT: orl %ecx, %esi
-; ALL-NEXT: shlq $32, %rsi
-; ALL-NEXT: orq %rdx, %rsi
-; ALL-NEXT: vmovq %rsi, %xmm0
-; ALL-NEXT: vmovq %rax, %xmm1
-; ALL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; ALL-NEXT: vcvtps2ph $4, %ymm0, %xmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
@@ -1577,222 +391,21 @@
define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind {
; AVX1-LABEL: cvt_16f32_to_16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: pushq %rbp
-; AVX1-NEXT: pushq %r15
-; AVX1-NEXT: pushq %r14
-; AVX1-NEXT: pushq %r12
-; AVX1-NEXT: pushq %rbx
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm3, %r8d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm3, %r9d
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm3, %r10d
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r11d
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r14d
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vmovd %xmm2, %r15d
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm2
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %r12d
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm3, %edx
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm3, %esi
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vmovd %xmm3, %ebx
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %ebp
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %edi
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vmovd %xmm1, %ecx
-; AVX1-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $4, %ebp, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $5, %ebx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm1
-; AVX1-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $4, %r11d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $6, %r9d, %xmm1, %xmm1
-; AVX1-NEXT: vpinsrw $7, %r8d, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: popq %rbx
-; AVX1-NEXT: popq %r12
-; AVX1-NEXT: popq %r14
-; AVX1-NEXT: popq %r15
-; AVX1-NEXT: popq %rbp
+; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: cvt_16f32_to_16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vmovd %xmm3, %r8d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vmovd %xmm3, %r9d
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vmovd %xmm3, %r10d
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r11d
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r14d
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vmovd %xmm2, %r15d
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm2
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %r12d
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vmovd %xmm3, %edx
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vmovd %xmm3, %esi
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vmovd %xmm3, %ebx
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %ebp
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %edi
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm0
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vmovd %xmm1, %ecx
-; AVX2-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $4, %ebp, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $5, %ebx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm1
-; AVX2-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $4, %r11d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $6, %r9d, %xmm1, %xmm1
-; AVX2-NEXT: vpinsrw $7, %r8d, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: cvt_16f32_to_16i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %r8d
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %r9d
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %r10d
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovd %xmm1, %r11d
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovd %xmm1, %r14d
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vmovd %xmm1, %r15d
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %r12d
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm3, %edx
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm3, %esi
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vmovd %xmm3, %ebx
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %ebp
-; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %edi
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %ecx
-; AVX512-NEXT: vpinsrw $1, %ecx, %xmm2, %xmm0
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $4, %ebp, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $5, %ebx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $6, %esi, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrw $1, %r12d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $2, %r15d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $3, %r14d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $4, %r11d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $6, %r9d, %xmm1, %xmm1
-; AVX512-NEXT: vpinsrw $7, %r8d, %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vcvtps2ph $4, %zmm0, %ymm0
; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>
%2 = bitcast <16 x half> %1 to <16 x i16>
@@ -1818,17 +431,7 @@
define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f32_to_4i16:
; ALL: # %bb.0:
-; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vpextrw $0, %xmm0, (%rdi)
-; ALL-NEXT: vpextrw $0, %xmm3, 6(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm2, 4(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm1, 2(%rdi)
+; ALL-NEXT: vcvtps2ph $4, %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
%2 = bitcast <4 x half> %1 to <4 x i16>
@@ -1839,18 +442,7 @@
define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f32_to_8i16_undef:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; ALL-NEXT: vmovaps %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -1863,18 +455,7 @@
define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_4f32_to_8i16_zero:
; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpextrw $0, %xmm1, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vpextrw $0, %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT: vmovaps %xmm0, (%rdi)
; ALL-NEXT: retq
%1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -1887,29 +468,7 @@
define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
; ALL-LABEL: store_cvt_8f32_to_8i16:
; ALL: # %bb.0:
-; ALL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; ALL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; ALL-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm4
-; ALL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm5, %xmm5
-; ALL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0]
-; ALL-NEXT: vcvtps2ph $4, %xmm6, %xmm6
-; ALL-NEXT: vpermilps {{.*#+}} xmm7 = xmm4[3,1,2,3]
-; ALL-NEXT: vcvtps2ph $4, %xmm7, %xmm7
-; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; ALL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; ALL-NEXT: vpextrw $0, %xmm4, 8(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm0, (%rdi)
-; ALL-NEXT: vpextrw $0, %xmm7, 14(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm6, 12(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm5, 10(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm3, 6(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm2, 4(%rdi)
-; ALL-NEXT: vpextrw $0, %xmm1, 2(%rdi)
+; ALL-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%1 = fptrunc <8 x float> %a0 to <8 x half>
@@ -1921,155 +480,21 @@
define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwind {
; AVX1-LABEL: store_cvt_16f32_to_16i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm8
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm9
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm5, %xmm11
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm7, %xmm12
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm13
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm14
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm15
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm7[1,0]
-; AVX1-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm7[3,1,2,3]
-; AVX1-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX1-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX1-NEXT: vcvtps2ph $4, %xmm6, %xmm6
-; AVX1-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX1-NEXT: vcvtps2ph $4, %xmm7, %xmm7
-; AVX1-NEXT: vpextrw $0, %xmm7, 24(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm1, 16(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm6, 8(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm4, 30(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm3, 28(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm2, 26(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm5, 22(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm15, 20(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm14, 18(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm13, 14(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm12, 12(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm11, 10(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm10, 6(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm9, 4(%rdi)
-; AVX1-NEXT: vpextrw $0, %xmm8, 2(%rdi)
+; AVX1-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
+; AVX1-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_cvt_16f32_to_16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm8
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm9
-; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm10
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm5, %xmm11
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm7, %xmm12
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm13
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm14
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm15
-; AVX2-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm5, %xmm5
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm7[1,1,3,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm7[1,0]
-; AVX2-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm7[3,1,2,3]
-; AVX2-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX2-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX2-NEXT: vcvtps2ph $4, %xmm6, %xmm6
-; AVX2-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX2-NEXT: vcvtps2ph $4, %xmm7, %xmm7
-; AVX2-NEXT: vpextrw $0, %xmm7, 24(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm1, 16(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm6, 8(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm4, 30(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm3, 28(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm2, 26(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm5, 22(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm15, 20(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm14, 18(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm13, 14(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm12, 12(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm11, 10(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm10, 6(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm9, 4(%rdi)
-; AVX2-NEXT: vpextrw $0, %xmm8, 2(%rdi)
+; AVX2-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi)
+; AVX2-NEXT: vcvtps2ph $4, %ymm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: store_cvt_16f32_to_16i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm9
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm10
-; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm11
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm12
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm5 = xmm6[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm13
-; AVX512-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm14
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm8
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm15
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm8[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512-NEXT: vextractf128 $1, %ymm8, %xmm4
-; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm4[1,1,3,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm5
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm7
-; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[3,1,2,3]
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vcvtps2ph $4, %xmm6, %xmm6
-; AVX512-NEXT: vcvtps2ph $4, %xmm8, %xmm8
-; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512-NEXT: vpextrw $0, %xmm4, 24(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm8, 16(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm6, 8(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm1, 30(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm7, 28(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm5, 26(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm3, 22(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm2, 20(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm15, 18(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm14, 14(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm13, 12(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm12, 10(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm11, 6(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm10, 4(%rdi)
-; AVX512-NEXT: vpextrw $0, %xmm9, 2(%rdi)
+; AVX512-NEXT: vcvtps2ph $4, %zmm0, (%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = fptrunc <16 x float> %a0 to <16 x half>