Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1140,7 +1140,7 @@
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
 
-    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
       setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
@@ -1244,18 +1244,6 @@
     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
-
-    // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
-    setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
-    setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
   }
 
   setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
@@ -1515,13 +1503,8 @@
       setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
     }
 
-    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
-      if (Subtarget.hasVLX()) {
-        // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
-        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
-        setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
-      }
     }
   }
@@ -18450,6 +18433,12 @@
   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
     loadRegZize = 128;
 
+  // If we don't have BWI we won't be able to create the shuffle needed for
+  // v8i8->v8i64.
+  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+      MemVT == MVT::v8i8)
+    loadRegZize = 128;
+
   // Represent our vector as a sequence of elements which are the
   // largest scalar that we can load.
   EVT LoadUnitVecVT = EVT::getVectorVT(
@@ -18516,6 +18505,13 @@
     return Shuff;
   }
 
+  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+      MemVT == MVT::v8i8) {
+    SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+    return Sext;
+  }
+
   // Redistribute the loaded elements into the different locations.
   SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
   for (unsigned i = 0; i != NumElems; ++i)
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -7992,46 +7992,6 @@
 defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
 defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
 
-// EXTLOAD patterns, implemented using vpmovz
-multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
-                               X86VectorVTInfo From, PatFrag LdFrag> {
-  def : Pat<(To.VT (LdFrag addr:$src)),
-            (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
-  def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)),
-            (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
-             To.KRC:$mask, addr:$src)>;
-  def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src),
-                    To.ImmAllZerosV)),
-            (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
-             addr:$src)>;
-}
-
-let Predicates = [HasVLX, HasBWI] in {
-  defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>;
-}
-let Predicates = [HasBWI] in {
-  defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>;
-}
-let Predicates = [HasVLX, HasAVX512] in {
-  defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>;
-  defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>;
-  defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>;
-  defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>;
-  defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>;
-  defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>;
-}
-let Predicates = [HasAVX512] in {
-  defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>;
-  defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>;
-  defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>;
-  defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
-}
-
 multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
                                  SDNode InVecOp, PatFrag ExtLoad16> {
Index: test/CodeGen/X86/avx2-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx2-vbroadcast.ll
+++ test/CodeGen/X86/avx2-vbroadcast.ll
@@ -235,65 +235,33 @@
 }
 
 define <8 x i16> @broadcast_mem_v4i16_v8i16(<4 x i16>* %ptr) {
-; X32-AVX2-LABEL: broadcast_mem_v4i16_v8i16:
-; X32-AVX2:       ## BB#0:
-; X32-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; X32-AVX2-NEXT:    retl
-;
-; X64-AVX2-LABEL: broadcast_mem_v4i16_v8i16:
-; X64-AVX2:       ## BB#0:
-; X64-AVX2-NEXT:    vpbroadcastq (%rdi), %xmm0
-; X64-AVX2-NEXT:    retq
-;
-; X32-AVX512VL-LABEL: broadcast_mem_v4i16_v8i16:
-; X32-AVX512VL:       ## BB#0:
-; X32-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X32-AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13]
-; X32-AVX512VL-NEXT:    retl
+; X32-LABEL: broadcast_mem_v4i16_v8i16:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X32-NEXT:    retl
 ;
-; X64-AVX512VL-LABEL: broadcast_mem_v4i16_v8i16:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13]
-; X64-AVX512VL-NEXT:    retq
+; X64-LABEL: broadcast_mem_v4i16_v8i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vpbroadcastq (%rdi), %xmm0
+; X64-NEXT:    retq
   %load = load <4 x i16>, <4 x i16>* %ptr
   %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <8 x i16> %shuf
 }
 
 define <16 x i16> @broadcast_mem_v4i16_v16i16(<4 x i16>* %ptr) {
-; X32-AVX2-LABEL: broadcast_mem_v4i16_v16i16:
-; X32-AVX2:       ## BB#0:
-; X32-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX2-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
-; X32-AVX2-NEXT:    retl
-;
-; X64-AVX2-LABEL: broadcast_mem_v4i16_v16i16:
-; X64-AVX2:       ## BB#0:
-; X64-AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; X64-AVX2-NEXT:    retq
-;
-; X32-AVX512VL-LABEL: broadcast_mem_v4i16_v16i16:
-; X32-AVX512VL:       ## BB#0:
-; X32-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X32-AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X32-AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X32-AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-AVX512VL-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X32-AVX512VL-NEXT:    retl
+; X32-LABEL: broadcast_mem_v4i16_v16i16:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vbroadcastsd %xmm0, %ymm0
+; X32-NEXT:    retl
 ;
-; X64-AVX512VL-LABEL: broadcast_mem_v4i16_v16i16:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX512VL-NEXT:    vpbroadcastq %xmm0, %ymm0
-; X64-AVX512VL-NEXT:    retq
+; X64-LABEL: broadcast_mem_v4i16_v16i16:
+; X64:       ## BB#0:
+; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
+; X64-NEXT:    retq
   %load = load <4 x i16>, <4 x i16>* %ptr
   %shuf = shufflevector <4 x i16> %load, <4 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <16 x i16> %shuf
 }
Index: test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
===================================================================
--- test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
+++ test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
@@ -318,8 +317,7 @@
 define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_2xi32_to_4xi32_mem:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; CHECK-NEXT:    vpbroadcastq (%rdi), %xmm0
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -328,10 +327,9 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp, <4 x i32> %default) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -342,10 +340,9 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask0(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -355,10 +352,9 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp, <4 x i32> %default) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $3, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -369,10 +365,9 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask1(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $3, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -382,10 +377,9 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp, <4 x i32> %default) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $5, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -396,10 +390,9 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask2(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $5, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -409,10 +402,9 @@
 define <4 x i32> @test_masked_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp, <4 x i32> %default) {
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $13, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -423,10 +415,9 @@
 define <4 x i32> @test_masked_z_2xi32_to_4xi32_mem_mask3(<2 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $13, %al
 ; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
+; CHECK-NEXT:    vbroadcasti32x2 (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
   %shuf = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
Index: test/CodeGen/X86/vec_int_to_fp.ll
===================================================================
--- test/CodeGen/X86/vec_int_to_fp.ll
+++ test/CodeGen/X86/vec_int_to_fp.ll
@@ -2645,8 +2645,7 @@
 ;
 ; AVX512VL-LABEL: sitofp_load_2i32_to_2f64:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -2657,8 +2656,7 @@
 ; AVX512VLDQ-LABEL: sitofp_load_2i32_to_2f64:
 ; AVX512VLDQ:       # BB#0:
-; AVX512VLDQ-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VLDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VLDQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VLDQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %ld = load <2 x i32>, <2 x i32> *%a
@@ -2982,8 +2980,7 @@
 ;
 ; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -2997,8 +2994,7 @@
 ; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
 ; AVX512VLDQ:       # BB#0:
-; AVX512VLDQ-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX512VLDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VLDQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %ld = load <2 x i32>, <2 x i32> *%a
@@ -3015,44 +3011,12 @@
 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: uitofp_load_2i16_to_2f64:
-; VEX:       # BB#0:
-; VEX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512DQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i16_to_2f64:
-; AVX512VLDQ:       # BB#0:
-; AVX512VLDQ-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX512VLDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VLDQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
+; AVX-LABEL: uitofp_load_2i16_to_2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    retq
   %ld = load <2 x i16>, <2 x i16> *%a
   %cvt = uitofp <2 x i16> %ld to <2 x double>
   ret <2 x double> %cvt
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2168,17 +2168,11 @@
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: insert_mem_lo_v4i32:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_mem_lo_v4i32:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,3]
-; AVX512VL-NEXT:    retq
+; AVX2OR512VL-LABEL: insert_mem_lo_v4i32:
+; AVX2OR512VL:       # BB#0:
+; AVX2OR512VL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2OR512VL-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2OR512VL-NEXT:    retq
   %a = load <2 x i32>, <2 x i32>* %ptr
   %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -2210,17 +2204,11 @@
 ; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: insert_mem_hi_v4i32:
-; AVX1OR2:       # BB#0:
-; AVX1OR2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1OR2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: insert_mem_hi_v4i32:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: insert_mem_hi_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
   %a = load <2 x i32>, <2 x i32>* %ptr
   %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>