diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -15002,9 +15002,10 @@
                                        SDValue V2, ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
+  MVT EltVT = VT.getVectorElementType();
   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
-        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
-        (Subtarget.hasAVX2() && VT.isInteger())))
+        (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
+        (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
     return SDValue();
 
   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -2110,4 +2110,113 @@
   ret <8 x half> %3
 }
 
+define void @pr63114() {
+; CHECK-LIBCALL-LABEL: pr63114:
+; CHECK-LIBCALL:       # %bb.0:
+; CHECK-LIBCALL-NEXT:    movdqu (%rax), %xmm4
+; CHECK-LIBCALL-NEXT:    pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,3,4,5,6,7]
+; CHECK-LIBCALL-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
+; CHECK-LIBCALL-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
+; CHECK-LIBCALL-NEXT:    pand %xmm1, %xmm0
+; CHECK-LIBCALL-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-LIBCALL-NEXT:    por %xmm2, %xmm0
+; CHECK-LIBCALL-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
+; CHECK-LIBCALL-NEXT:    pand %xmm3, %xmm0
+; CHECK-LIBCALL-NEXT:    movdqa {{.*#+}} xmm5 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60]
+; CHECK-LIBCALL-NEXT:    por %xmm5, %xmm0
+; CHECK-LIBCALL-NEXT:    pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,7,7]
+; CHECK-LIBCALL-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; CHECK-LIBCALL-NEXT:    pand %xmm1, %xmm6
+; CHECK-LIBCALL-NEXT:    por %xmm2, %xmm6
+; CHECK-LIBCALL-NEXT:    pand %xmm3, %xmm6
+; CHECK-LIBCALL-NEXT:    por %xmm5, %xmm6
+; CHECK-LIBCALL-NEXT:    pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,5,5,5]
+; CHECK-LIBCALL-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,3,0,3]
+; CHECK-LIBCALL-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
+; CHECK-LIBCALL-NEXT:    pand %xmm1, %xmm4
+; CHECK-LIBCALL-NEXT:    por %xmm2, %xmm4
+; CHECK-LIBCALL-NEXT:    pand %xmm3, %xmm4
+; CHECK-LIBCALL-NEXT:    por %xmm5, %xmm4
+; CHECK-LIBCALL-NEXT:    pand %xmm1, %xmm7
+; CHECK-LIBCALL-NEXT:    por %xmm2, %xmm7
+; CHECK-LIBCALL-NEXT:    pand %xmm3, %xmm7
+; CHECK-LIBCALL-NEXT:    por %xmm5, %xmm7
+; CHECK-LIBCALL-NEXT:    movdqu %xmm7, 0
+; CHECK-LIBCALL-NEXT:    movdqu %xmm4, 32
+; CHECK-LIBCALL-NEXT:    movdqu %xmm6, 48
+; CHECK-LIBCALL-NEXT:    movdqu %xmm0, 16
+; CHECK-LIBCALL-NEXT:    retq
+;
+; BWON-F16C-LABEL: pr63114:
+; BWON-F16C:       # %bb.0:
+; BWON-F16C-NEXT:    vmovdqu (%rax), %xmm0
+; BWON-F16C-NEXT:    vpsrld $16, %xmm0, %xmm1
+; BWON-F16C-NEXT:    vbroadcastss (%rax), %xmm2
+; BWON-F16C-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; BWON-F16C-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
+; BWON-F16C-NEXT:    vpinsrw $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; BWON-F16C-NEXT:    vpsllq $48, %xmm3, %xmm4
+; BWON-F16C-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7]
+; BWON-F16C-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1]
+; BWON-F16C-NEXT:    vpor %xmm3, %xmm2, %xmm2
+; BWON-F16C-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[0,3],xmm1[2,0]
+; BWON-F16C-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; BWON-F16C-NEXT:    vpor %xmm3, %xmm1, %xmm1
+; BWON-F16C-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; BWON-F16C-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,3,4,5,6,7]
+; BWON-F16C-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,1]
+; BWON-F16C-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7]
+; BWON-F16C-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7]
+; BWON-F16C-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; BWON-F16C-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3],xmm0[4,5,6,7]
+; BWON-F16C-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm3[7]
+; BWON-F16C-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; BWON-F16C-NEXT:    vmovups %ymm0, 0
+; BWON-F16C-NEXT:    vmovups %ymm1, 32
+; BWON-F16C-NEXT:    vzeroupper
+; BWON-F16C-NEXT:    retq
+;
+; CHECK-I686-LABEL: pr63114:
+; CHECK-I686:       # %bb.0:
+; CHECK-I686-NEXT:    movdqu (%eax), %xmm6
+; CHECK-I686-NEXT:    pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,3,4,5,6,7]
+; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
+; CHECK-I686-NEXT:    pand %xmm1, %xmm0
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-I686-NEXT:    por %xmm2, %xmm0
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
+; CHECK-I686-NEXT:    pand %xmm3, %xmm0
+; CHECK-I686-NEXT:    movdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60]
+; CHECK-I686-NEXT:    por %xmm4, %xmm0
+; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,4,5,7,7]
+; CHECK-I686-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; CHECK-I686-NEXT:    pand %xmm1, %xmm5
+; CHECK-I686-NEXT:    por %xmm2, %xmm5
+; CHECK-I686-NEXT:    pand %xmm3, %xmm5
+; CHECK-I686-NEXT:    por %xmm4, %xmm5
+; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,5,5,5,5]
+; CHECK-I686-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,3,0,3]
+; CHECK-I686-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; CHECK-I686-NEXT:    pand %xmm1, %xmm6
+; CHECK-I686-NEXT:    por %xmm2, %xmm6
+; CHECK-I686-NEXT:    pand %xmm3, %xmm6
+; CHECK-I686-NEXT:    por %xmm4, %xmm6
+; CHECK-I686-NEXT:    pand %xmm1, %xmm7
+; CHECK-I686-NEXT:    por %xmm2, %xmm7
+; CHECK-I686-NEXT:    pand %xmm3, %xmm7
+; CHECK-I686-NEXT:    por %xmm4, %xmm7
+; CHECK-I686-NEXT:    movdqu %xmm7, 0
+; CHECK-I686-NEXT:    movdqu %xmm6, 32
+; CHECK-I686-NEXT:    movdqu %xmm5, 48
+; CHECK-I686-NEXT:    movdqu %xmm0, 16
+; CHECK-I686-NEXT:    retl
+  %1 = load <24 x half>, ptr poison, align 2
+  %2 = shufflevector <24 x half> %1, <24 x half> poison, <8 x i32>
+  %3 = shufflevector <8 x half> %2, <8 x half> , <16 x i32>
+  %4 = shufflevector <16 x half> poison, <16 x half> %3, <32 x i32>
+  store <32 x half> %4, ptr null, align 2
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -932,56 +932,107 @@
 }
 
 define <8 x float> @load_cvt_8i16_to_8f32(ptr %a0) nounwind {
-; AVX-LABEL: load_cvt_8i16_to_8f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    subq $48, %rsp
-; AVX-NEXT:    movq %rdi, %rbx
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
-; AVX-NEXT:    callq __extendhfsf2@PLT
-; AVX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT:    addq $48, %rsp
-; AVX-NEXT:    popq %rbx
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_cvt_8i16_to_8f32:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    subq $48, %rsp
+; AVX1-NEXT:    movq %rdi, %rbx
+; AVX1-NEXT:    vmovaps (%rdi), %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vbroadcastss 8(%rdi), %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vbroadcastss 12(%rbx), %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vbroadcastss 4(%rbx), %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX1-NEXT:    callq __extendhfsf2@PLT
+; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX1-NEXT:    addq $48, %rsp
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_cvt_8i16_to_8f32:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    subq $48, %rsp
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT:    vpsrlq $48, %xmm0, %xmm0
+; AVX2-NEXT:    callq __extendhfsf2@PLT
+; AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX2-NEXT:    addq $48, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    retq
 ;
 ; F16C-LABEL: load_cvt_8i16_to_8f32:
 ; F16C:       # %bb.0:
@@ -1004,9 +1055,9 @@
 ; AVX1-NEXT:    pushq %rbx
 ; AVX1-NEXT:    subq $80, %rsp
 ; AVX1-NEXT:    movq %rdi, %rbx
-; AVX1-NEXT:    vpinsrw $0, 8(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss 8(%rdi), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vmovdqa (%rbx), %xmm1
 ; AVX1-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT:    vmovaps 16(%rbx), %xmm0
@@ -1016,7 +1067,7 @@
 ; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss 12(%rbx), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
@@ -1036,7 +1087,7 @@
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 4(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss 4(%rbx), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
@@ -1047,18 +1098,18 @@
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 24(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vbroadcastss 24(%rbx), %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 28(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss 28(%rbx), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
@@ -1078,7 +1129,7 @@
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 20(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss 20(%rbx), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
@@ -1958,9 +2009,9 @@
 ; AVX1-NEXT:    pushq %rbx
 ; AVX1-NEXT:    subq $80, %rsp
 ; AVX1-NEXT:    movq %rdi, %rbx
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vmovaps (%rdi), %xmm0
+; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT:    vbroadcastss 4(%rdi), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1982,8 +2033,8 @@
 ; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 12(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT:    vbroadcastss 12(%rbx), %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
@@ -1995,7 +2046,7 @@
 ; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpinsrw $0, 8(%rbx), %xmm0, %xmm0
+; AVX1-NEXT:    vbroadcastss 8(%rbx), %xmm0
 ; AVX1-NEXT:    callq __extendhfsf2@PLT
 ; AVX1-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill