diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36100,8 +36100,9 @@
         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
       Shuffle = X86ISD::VZEXT_MOVL;
-      SrcVT = DstVT =
-          !Subtarget.hasSSE2() && MaskEltSize == 32 ? MVT::v4f32 : MaskVT;
+      SrcVT = DstVT = MaskEltSize == 16      ? MVT::v8f16
+                      : !Subtarget.hasSSE2() ? MVT::v4f32
+                                             : MaskVT;
       return true;
     }
   }
@@ -36145,11 +36146,14 @@
   }
 
   // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
-  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
+       (MaskEltSize == 16 && Subtarget.hasFP16())) &&
       isUndefOrEqual(Mask[0], 0) &&
       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
     Shuffle = X86ISD::VZEXT_MOVL;
-    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    SrcVT = DstVT = MaskEltSize == 16      ? MVT::v8f16
+                    : !Subtarget.hasSSE2() ? MVT::v4f32
+                                           : MaskVT;
     return true;
   }
 
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll
@@ -1926,3 +1926,32 @@
   store <4 x half> %c, <4 x half>* %z
   ret void
 }
+
+define <8 x half> @test21(half %a, half %b, half %c) nounwind {
+; X64-LABEL: test21:
+; X64:       # %bb.0:
+; X64-NEXT:    vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT:    vmovsh %xmm2, %xmm3, %xmm2
+; X64-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vpbroadcastw %xmm1, %xmm1
+; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X64-NEXT:    retq
+;
+; X86-LABEL: test21:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm1
+; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT:    vmovsh {{[0-9]+}}(%esp), %xmm1
+; X86-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vpbroadcastw %xmm1, %xmm1
+; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X86-NEXT:    retl
+  %1 = insertelement <8 x half> , half %a, i32 0
+  %2 = insertelement <8 x half> %1, half %b, i32 1
+  %3 = insertelement <8 x half> %2, half %c, i32 2
+  ret <8 x half> %3
+}
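
For illustration only (this snippet is not part of the patch, and the function name is made up): the MaskEltSize == 16 path added above targets shuffles that keep the low half element and zero the remaining elements, i.e. the v8f16 analogue of the MOVSS/MOVSD zero-extending moves. A minimal IR sketch that should exercise the new matching when compiled with something like llc -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl (so Subtarget.hasFP16() holds):

define <8 x half> @movsh_zero_upper(<8 x half> %x) {
  ; Keep element 0 of %x and take zeros for elements 1-7; with AVX512-FP16 this
  ; shuffle can now be matched to X86ISD::VZEXT_MOVL on v8f16 (zero-extending vmovsh).
  %r = shufflevector <8 x half> %x, <8 x half> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <8 x half> %r
}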