Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -1628,6 +1628,7 @@
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
+  setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
@@ -35498,6 +35499,49 @@
   return SDValue();
 }
 
+static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  MVT OpVT = N->getSimpleValueType(0);
+
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return SDValue();
+
+  SDLoc dl(N);
+  SDValue Vec = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  // Combine an extract_subvector of an extract into a single extract_subvector.
+  if (Vec.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    unsigned OtherIdxVal =
+        cast<ConstantSDNode>(Vec.getOperand(1))->getZExtValue();
+
+    // Only do this combine if both indices are non-zero or both zero. If one is
+    // zero we should favor the subreg operation first.
+    // TODO: are there other cases that would be good to handle?
+    if (IdxVal == 0 && OtherIdxVal == 0) {
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+                         Vec.getOperand(0), N->getOperand(1));
+    }
+  }
+
+  // If we're only using the lower part of an operation, try to narrow the op.
+  if ((Vec.getOpcode() == ISD::ADD || Vec.getOpcode() == ISD::SUB) &&
+      Vec.hasOneUse() && IdxVal == 0) {
+    SDValue LHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+                              Vec.getOperand(0), Idx);
+    SDValue RHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT,
+                              Vec.getOperand(1), Idx);
+    return DAG.getNode(Vec.getOpcode(), dl, OpVT, LHS, RHS);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
@@ -35518,6 +35562,18 @@
   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   MVT SubVecVT = SubVec.getSimpleValueType();
 
+  // Look for two subregister inserts in a row and combine them into a single
+  // operation.
+  if (IdxVal == 0 && Vec.isUndef() &&
+      SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      SubVec.getOperand(0).isUndef()) {
+    auto *Idx2 = dyn_cast<ConstantSDNode>(SubVec.getOperand(2));
+    if (Idx2 && Idx2->getZExtValue() == 0) {
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
+                         SubVec.getOperand(1), Idx);
+    }
+  }
+
   // If this is an insert of an extract, combine to a shuffle. Don't do this
   // if the insert or extract can be represented with a subregister operation.
   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
@@ -35609,6 +35665,8 @@
     return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
   case ISD::INSERT_SUBVECTOR:
     return combineInsertSubvector(N, DAG, DCI, Subtarget);
+  case ISD::EXTRACT_SUBVECTOR:
+    return combineExtractSubvector(N, DAG, DCI);
   case ISD::VSELECT:
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3958,10 +3958,7 @@
 
 def : Pat<(masked_store addr:$dst, Mask,
              (_.info512.VT (insert_subvector undef,
-                               (_.info256.VT (insert_subvector undef,
-                                                 (_.info128.VT _.info128.RC:$src),
-                                                 (iPTR 0))),
-                               (iPTR 0)))),
+                               (_.info128.VT _.info128.RC:$src), (iPTR 0)))),
           (!cast<Instruction>(InstrStr#mrk) addr:$dst,
                       (COPY_TO_REGCLASS MaskRC:$mask, VK1WM),
                       (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
@@ -3975,10 +3972,7 @@
 
 def : Pat<(masked_store addr:$dst, Mask,
              (_.info512.VT (insert_subvector undef,
-                               (_.info256.VT (insert_subvector undef,
-                                                 (_.info128.VT _.info128.RC:$src),
-                                                 (iPTR 0))),
-                               (iPTR 0)))),
+                               (_.info128.VT _.info128.RC:$src), (iPTR 0)))),
           (!cast<Instruction>(InstrStr#mrk) addr:$dst,
                       (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
                       (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
@@ -4000,9 +3994,7 @@
 def : Pat<(_.info128.VT (extract_subvector
                          (_.info512.VT (masked_load addr:$srcAddr, Mask,
                                         (_.info512.VT (insert_subvector undef,
-                                              (_.info256.VT (insert_subvector undef,
-                                                    (_.info128.VT (X86vzmovl _.info128.RC:$src)),
-                                                    (iPTR 0))),
+                                              (_.info128.VT (X86vzmovl _.info128.RC:$src)),
                                               (iPTR 0))))),
                          (iPTR 0))),
           (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
@@ -4028,9 +4020,7 @@
 def : Pat<(_.info128.VT (extract_subvector
                          (_.info512.VT (masked_load addr:$srcAddr, Mask,
                                         (_.info512.VT (insert_subvector undef,
-                                              (_.info256.VT (insert_subvector undef,
-                                                    (_.info128.VT (X86vzmovl _.info128.RC:$src)),
-                                                    (iPTR 0))),
+                                              (_.info128.VT (X86vzmovl _.info128.RC:$src)),
                                               (iPTR 0))))),
                          (iPTR 0))),
           (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -2985,7 +2985,6 @@
 define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1 %XMM1 %ZMM1
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
@@ -3006,7 +3005,6 @@
 define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: ## kill: %XMM1 %XMM1 %ZMM1
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
Index: test/CodeGen/X86/madd.ll
===================================================================
--- test/CodeGen/X86/madd.ll
+++ test/CodeGen/X86/madd.ll
@@ -330,7 +330,7 @@
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vmovd %xmm0, %eax
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
Index: test/CodeGen/X86/sad.ll
===================================================================
--- test/CodeGen/X86/sad.ll
+++ test/CodeGen/X86/sad.ll
@@ -79,7 +79,7 @@
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm0, %eax
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -105,7 +105,7 @@
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -328,7 +328,7 @@
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm0, %eax
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -356,7 +356,7 @@
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
@@ -801,7 +801,7 @@
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm0, %eax
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
@@ -830,7 +830,7 @@
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, %eax
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll
===================================================================
--- test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -261,10 +261,10 @@
 ;
 ; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -277,10 +277,10 @@
 ;
 ; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512BWVL: # BB#0:
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm3, %xmm1
 ; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
Index: test/CodeGen/X86/vector-half-conversions.ll
===================================================================
--- test/CodeGen/X86/vector-half-conversions.ll
+++ test/CodeGen/X86/vector-half-conversions.ll
@@ -4146,7 +4146,10 @@
 ; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm2
 ; AVX512F-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
+; AVX512F-NEXT: vmovd %xmm4, %eax
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm4
+; AVX512F-NEXT: movw %ax, (%rdi)
 ; AVX512F-NEXT: vmovd %xmm4, %eax
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm4
 ; AVX512F-NEXT: movw %ax, 24(%rdi)
@@ -4154,19 +4157,32 @@
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm4
 ; AVX512F-NEXT: movw %ax, 16(%rdi)
 ; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm4
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
 ; AVX512F-NEXT: movw %ax, 8(%rdi)
 ; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
-; AVX512F-NEXT: movw %ax, (%rdi)
+; AVX512F-NEXT: movw %ax, 6(%rdi)
 ; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: movw %ax, 4(%rdi)
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: movw %ax, 2(%rdi)
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm3[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
 ; AVX512F-NEXT: movw %ax, 30(%rdi)
-; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm4, %ymm4
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
 ; AVX512F-NEXT: movw %ax, 28(%rdi)
@@ -4175,37 +4191,21 @@
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
 ; AVX512F-NEXT: movw %ax, 26(%rdi)
 ; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: movw %ax, 22(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm3, %ymm3
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
 ; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: movw %ax, 20(%rdi)
+; AVX512F-NEXT: movw %ax, 22(%rdi)
 ; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
+; AVX512F-NEXT: movw %ax, 20(%rdi)
+; AVX512F-NEXT: vmovd %xmm3, %eax
 ; AVX512F-NEXT: movw %ax, 18(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvtps2ph $4, %zmm1, %ymm1
-; AVX512F-NEXT: movw %ax, 14(%rdi)
 ; AVX512F-NEXT: vmovd %xmm1, %eax
-; AVX512F-NEXT: movw %ax, 12(%rdi)
-; AVX512F-NEXT: vmovd %xmm2, %eax
-; AVX512F-NEXT: movw %ax, 10(%rdi)
+; AVX512F-NEXT: movw %ax, 14(%rdi)
 ; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: movw %ax, 6(%rdi)
-; AVX512F-NEXT: vmovd %xmm3, %eax
-; AVX512F-NEXT: movw %ax, 4(%rdi)
+; AVX512F-NEXT: movw %ax, 12(%rdi)
 ; AVX512F-NEXT: vmovd %xmm4, %eax
-; AVX512F-NEXT: movw %ax, 2(%rdi)
+; AVX512F-NEXT: movw %ax, 10(%rdi)
 ; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
@@ -4214,7 +4214,10 @@
 ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
 ; AVX512VL-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512VL-NEXT: vmovd %xmm4, %eax
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm4
+; AVX512VL-NEXT: movw %ax, (%rdi)
 ; AVX512VL-NEXT: vmovd %xmm4, %eax
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm4
 ; AVX512VL-NEXT: movw %ax, 24(%rdi)
@@ -4222,19 +4225,32 @@
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm4
 ; AVX512VL-NEXT: movw %ax, 16(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm4
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
 ; AVX512VL-NEXT: movw %ax, 8(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
-; AVX512VL-NEXT: movw %ax, (%rdi)
+; AVX512VL-NEXT: movw %ax, 6(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: movw %ax, 4(%rdi)
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: movw %ax, 2(%rdi)
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm3[1,0]
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; AVX512VL-NEXT: movw %ax, 30(%rdi)
-; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
 ; AVX512VL-NEXT: movw %ax, 28(%rdi)
@@ -4243,37 +4259,21 @@
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
 ; AVX512VL-NEXT: movw %ax, 26(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: movw %ax, 22(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
 ; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: movw %ax, 20(%rdi)
+; AVX512VL-NEXT: movw %ax, 22(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
+; AVX512VL-NEXT: movw %ax, 20(%rdi)
+; AVX512VL-NEXT: vmovd %xmm3, %eax
 ; AVX512VL-NEXT: movw %ax, 18(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512VL-NEXT: movw %ax, 14(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm1, %eax
-; AVX512VL-NEXT: movw %ax, 12(%rdi)
-; AVX512VL-NEXT: vmovd %xmm2, %eax
-; AVX512VL-NEXT: movw %ax, 10(%rdi)
+; AVX512VL-NEXT: movw %ax, 14(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: movw %ax, 6(%rdi)
-; AVX512VL-NEXT: vmovd %xmm3, %eax
-; AVX512VL-NEXT: movw %ax, 4(%rdi)
+; AVX512VL-NEXT: movw %ax, 12(%rdi)
 ; AVX512VL-NEXT: vmovd %xmm4, %eax
-; AVX512VL-NEXT: movw %ax, 2(%rdi)
+; AVX512VL-NEXT: movw %ax, 10(%rdi)
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
   %1 = fptrunc <16 x float> %a0 to <16 x half>
@@ -5628,11 +5628,6 @@
 ; AVX512-NEXT: subq $200, %rsp
 ; AVX512-NEXT: movq %rdi, %rbx
 ; AVX512-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
-; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -5645,39 +5640,43 @@
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movl %eax, %r12d
+; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
 ; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movl %eax, %r13d
-; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
-; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: movl %eax, %r12d
+; AVX512-NEXT: vpermilpd $1, {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = mem[1,0]
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movl %eax, %ebp
+; AVX512-NEXT: movl %eax, %r13d
 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movl %eax, %r14d
+; AVX512-NEXT: movl %eax, %ebp
 ; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: movl %eax, %r14d
 ; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
 ; AVX512-NEXT: callq __truncdfhf2
-; AVX512-NEXT: movw %ax, 12(%rbx)
-; AVX512-NEXT: movw %r15w, 8(%rbx)
-; AVX512-NEXT: movw %r14w, 4(%rbx)
-; AVX512-NEXT: movw %bp, (%rbx)
-; AVX512-NEXT: movw %r13w, 14(%rbx)
-; AVX512-NEXT: movw %r12w, 10(%rbx)
+; AVX512-NEXT: movl %eax, %r15d
+; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: # kill: %XMM0 %XMM0 %ZMM0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: callq __truncdfhf2
+; AVX512-NEXT: movw %ax, (%rbx)
+; AVX512-NEXT: movw %r15w, 12(%rbx)
+; AVX512-NEXT: movw %r14w, 8(%rbx)
+; AVX512-NEXT: movw %bp, 4(%rbx)
+; AVX512-NEXT: movw %r13w, 2(%rbx)
+; AVX512-NEXT: movw %r12w, 14(%rbx)
 ; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
-; AVX512-NEXT: movw %ax, 6(%rbx)
+; AVX512-NEXT: movw %ax, 10(%rbx)
 ; AVX512-NEXT: movzwl {{[0-9]+}}(%rsp), %eax # 2-byte Folded Reload
-; AVX512-NEXT: movw %ax, 2(%rbx)
+; AVX512-NEXT: movw %ax, 6(%rbx)
 ; AVX512-NEXT: addq $200, %rsp
 ; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: popq %r12
Index: test/CodeGen/X86/x86-interleaved-access.ll
===================================================================
--- test/CodeGen/X86/x86-interleaved-access.ll
+++ test/CodeGen/X86/x86-interleaved-access.ll
@@ -843,11 +843,11 @@
 ; AVX512-NEXT: vpshufb %xmm7, %xmm14, %xmm5
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm6
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm6
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm10
+; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm5
@@ -855,10 +855,10 @@
 ; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm4
 ; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm7
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm6
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13
-; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm6
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
 ; AVX512-NEXT: vpcmpeqb %ymm3, %ymm8, %ymm8
@@ -868,17 +868,17 @@
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm7
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1]
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm4
 ; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm4
+; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 =
@@ -887,16 +887,16 @@
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm7
 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm7
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
 ; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm4
 ; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
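
For context only, not part of the patch itself: the new EXTRACT_SUBVECTOR combine is what lets the reduction tails in the madd.ll and sad.ll diffs above finish with an xmm vpaddd instead of a zmm one. A minimal LLVM IR sketch of the shape it narrows is below; the function name and exact types are illustrative assumptions, not taken from the patch.

define i32 @narrow_add_low_lane(<16 x i32> %a, <16 x i32> %b) {
  ; The 512-bit add has a single use: an extract of elements 0..3, i.e. the
  ; low 128-bit subvector (extract_subvector with index 0 in the DAG), so the
  ; combine may rewrite it as a 128-bit add of the extracted operand halves.
  %sum = add <16 x i32> %a, %b
  %low = shufflevector <16 x i32> %sum, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %elt = extractelement <4 x i32> %low, i32 0
  ret i32 %elt
}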