Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14659,17 +14659,18 @@
   if (DAG.getDataLayout().isBigEndian())
     return SDValue();
 
-  // TODO: The one-use check is overly conservative. Check the cost of the
-  // extract instead or remove that condition entirely.
   auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
   auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
-      !ExtIdx)
+  if (!ExtIdx || !Ld || Ld->getExtensionType() || Ld->isVolatile())
+    return SDValue();
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = Extract->getValueType(0);
+  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
     return SDValue();
 
   // The narrow load will be offset from the base address of the old load if
   // we are extracting from something besides index 0 (little-endian).
-  EVT VT = Extract->getValueType(0);
   SDLoc DL(Extract);
   SDValue BaseAddr = Ld->getOperand(1);
   unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
Index: test/CodeGen/AArch64/arm64-vabs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-vabs.ll
+++ test/CodeGen/AArch64/arm64-vabs.ll
@@ -138,7 +138,7 @@
 define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
 ; CHECK-LABEL: uabdl8h_rdx
-; CHECK: uabdl2.8h
+; CHECK: uabdl.8h
 ; CHECK: uabdl.8h
   %aload = load <16 x i8>, <16 x i8>* %a, align 1
   %bload = load <16 x i8>, <16 x i8>* %b, align 1
@@ -156,7 +156,7 @@
 define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
 ; CHECK-LABEL: uabdl4s_rdx
-; CHECK: uabdl2.4s
+; CHECK: uabdl.4s
 ; CHECK: uabdl.4s
   %aload = load <8 x i16>, <8 x i16>* %a, align 1
   %bload = load <8 x i16>, <8 x i16>* %b, align 1
@@ -174,7 +174,7 @@
 define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
 ; CHECK: uabdl2d_rdx
-; CHECK: uabdl2.2d
+; CHECK: uabdl.2d
 ; CHECK: uabdl.2d
   %aload = load <4 x i32>, <4 x i32>* %a, align 1
   %bload = load <4 x i32>, <4 x i32>* %b, align 1
Index: test/CodeGen/AArch64/merge-store.ll
===================================================================
--- test/CodeGen/AArch64/merge-store.ll
+++ test/CodeGen/AArch64/merge-store.ll
@@ -4,7 +4,7 @@
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
 
-; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
+; CHECK: ldr d[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
 ; CHECK: str d[[R0]]
 
 define void @blam() {
Index: test/CodeGen/ARM/combine-vmovdrr.ll
===================================================================
--- test/CodeGen/ARM/combine-vmovdrr.ll
+++ test/CodeGen/ARM/combine-vmovdrr.ll
@@ -10,7 +10,8 @@
 ;
 ; CHECK-LABEL: motivatingExample:
 ; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1]
-; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK-NEXT: vld1.8 {[[ARG1_VALlo:d[0-9]+]]}, [r0]
+; CHECK-NEXT: vldr [[ARG1_VALhi:d[0-9]+]], [r0]
 ; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]]
 ; CHECK-NEXT: vstr [[RES]], [r1]
 ; CHECK-NEXT: bx lr
Index: test/CodeGen/ARM/vext.ll
===================================================================
--- test/CodeGen/ARM/vext.ll
+++ test/CodeGen/ARM/vext.ll
@@ -241,10 +241,10 @@
 define <4 x i16> @test_largespan(<8 x i16>* %B) nounwind {
 ; CHECK-LABEL: test_largespan:
 ; CHECK: @ BB#0:
-; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
-; CHECK-NEXT:
vorr d18, d16, d16 -; CHECK-NEXT: vuzp.16 d18, d17 -; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vld1.16 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vuzp.16 d16, d17 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %B %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> Index: test/CodeGen/ARM/vpadd.ll =================================================================== --- test/CodeGen/ARM/vpadd.ll +++ test/CodeGen/ARM/vpadd.ll @@ -217,7 +217,8 @@ define void @addCombineToVPADD_i8(<16 x i8> *%cbcr, <8 x i8> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADD_i8: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.8 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpadd.i8 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov pc, lr @@ -234,7 +235,8 @@ define void @addCombineToVPADD_i16(<8 x i16> *%cbcr, <4 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADD_i16: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.16 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpadd.i16 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov pc, lr @@ -250,7 +252,8 @@ define void @addCombineToVPADD_i32(<4 x i32> *%cbcr, <2 x i32> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADD_i32: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.32 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpadd.i32 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov pc, lr @@ -266,7 +269,8 @@ define void @addCombineToVPADDLq_s8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_s8: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.8 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpaddl.s8 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr @@ -310,7 +314,8 @@ define void @addCombineToVPADDLq_u8(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_u8: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.8 {d16}, [r0:64]! 
+; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpaddl.u8 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr @@ -329,11 +334,13 @@ define void @addCombineToVPADDLq_u8_early_zext(<16 x i8> *%cbcr, <8 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vmovl.u8 q9, d17 -; CHECK-NEXT: vmovl.u8 q8, d16 -; CHECK-NEXT: vuzp.16 q8, q9 -; CHECK-NEXT: vadd.i16 q8, q8, q9 +; CHECK-NEXT: vld1.8 {d16}, [r0:64] +; CHECK-NEXT: add r0, r0, #8 +; CHECK-NEXT: vld1.8 {d17}, [r0:64] +; CHECK-NEXT: vmovl.u8 q9, d16 +; CHECK-NEXT: vmovl.u8 q8, d17 +; CHECK-NEXT: vuzp.16 q9, q8 +; CHECK-NEXT: vadd.i16 q8, q9, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr %tmp = load <16 x i8>, <16 x i8>* %cbcr @@ -371,7 +378,7 @@ define void @addCombineToVPADDL_u8_early_zext(<16 x i8> *%cbcr, <4 x i16> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDL_u8_early_zext: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.8 {d16}, [r0:64] ; CHECK-NEXT: vmovl.u8 q8, d16 ; CHECK-NEXT: vpadd.i16 d16, d16, d17 ; CHECK-NEXT: vstr d16, [r1] @@ -389,7 +396,8 @@ define void @addCombineToVPADDLq_s16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_s16: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.16 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpaddl.s16 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr @@ -407,7 +415,8 @@ define void @addCombineToVPADDLq_u16(<8 x i16> *%cbcr, <4 x i32> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_u16: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.16 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpaddl.u16 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr @@ -425,7 +434,8 @@ define void @addCombineToVPADDLq_s32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_s32: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.32 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpaddl.s32 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr @@ -443,7 +453,8 @@ define void @addCombineToVPADDLq_u32(<4 x i32> *%cbcr, <2 x i64> *%X) nounwind ssp { ; CHECK-LABEL: addCombineToVPADDLq_u32: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] +; CHECK-NEXT: vld1.32 {d16}, [r0:64]! +; CHECK-NEXT: vldr d17, [r0] ; CHECK-NEXT: vpaddl.u32 q8, q8 ; CHECK-NEXT: vst1.64 {d16, d17}, [r1] ; CHECK-NEXT: mov pc, lr Index: test/CodeGen/ARM/vzip.ll =================================================================== --- test/CodeGen/ARM/vzip.ll +++ test/CodeGen/ARM/vzip.ll @@ -331,9 +331,10 @@ define void @vzip_vext_factor(<8 x i16>* %A, <4 x i16>* %B) { ; CHECK-LABEL: vzip_vext_factor: ; CHECK: @ BB#0: @ %entry -; CHECK-NEXT: vld1.64 {d16, d17}, [r0] -; CHECK-NEXT: vext.16 d18, d16, d17, #1 -; CHECK-NEXT: vext.16 d16, d18, d17, #2 +; CHECK-NEXT: vld1.16 {d16}, [r0:64]! 
+; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vext.16 d16, d16, d17, #1 +; CHECK-NEXT: vext.16 d16, d16, d17, #2 ; CHECK-NEXT: vext.16 d16, d16, d16, #1 ; CHECK-NEXT: vstr d16, [r1] ; CHECK-NEXT: mov pc, lr Index: test/CodeGen/X86/avx-vperm2x128.ll =================================================================== --- test/CodeGen/X86/avx-vperm2x128.ll +++ test/CodeGen/X86/avx-vperm2x128.ll @@ -53,7 +53,7 @@ ; AVX1-LABEL: shuffle_v8f32_01230123_mem: ; AVX1: ## BB#0: ## %entry ; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8f32_01230123_mem: @@ -194,10 +194,10 @@ define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readnone ssp { ; AVX1-LABEL: shuffle_v16i16_4501_mem: ; AVX1: ## BB#0: ## %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovaps (%rsi), %ymm1 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vmovaps (%rsi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_4501_mem: Index: test/CodeGen/X86/avx-vzeroupper.ll =================================================================== --- test/CodeGen/X86/avx-vzeroupper.ll +++ test/CodeGen/X86/avx-vzeroupper.ll @@ -97,8 +97,8 @@ ret <4 x float> %call3 } -;; Test the pass convergence and also that vzeroupper is only issued when necessary, -;; for this function it should be only once +;; Test the pass convergence and also that vzeroupper is only issued when necessary. +;; For this function, there is no vzeroupper because only half of the 32-byte load was used. 
define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind { ; VZ-LABEL: test03: @@ -121,9 +121,7 @@ ; VZ-NEXT: # =>This Inner Loop Header: Depth=1 ; VZ-NEXT: callq do_sse ; VZ-NEXT: callq do_sse -; VZ-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; VZ-NEXT: vextractf128 $1, %ymm0, %xmm0 -; VZ-NEXT: vzeroupper +; VZ-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; VZ-NEXT: callq do_sse ; VZ-NEXT: decl %ebx ; VZ-NEXT: jne .LBB3_3 @@ -152,8 +150,7 @@ ; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 ; FAST-YMM-ZMM-NEXT: callq do_sse ; FAST-YMM-ZMM-NEXT: callq do_sse -; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; FAST-YMM-ZMM-NEXT: vextractf128 $1, %ymm0, %xmm0 +; FAST-YMM-ZMM-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; FAST-YMM-ZMM-NEXT: callq do_sse ; FAST-YMM-ZMM-NEXT: decl %ebx ; FAST-YMM-ZMM-NEXT: jne .LBB3_3 @@ -182,8 +179,7 @@ ; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: callq do_sse -; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm0 -; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BTVER2-NEXT: vmovaps g+{{.*}}(%rip), %xmm0 ; BTVER2-NEXT: callq do_sse ; BTVER2-NEXT: decl %ebx ; BTVER2-NEXT: jne .LBB3_3 Index: test/CodeGen/X86/avx512-cvt.ll =================================================================== --- test/CodeGen/X86/avx512-cvt.ll +++ test/CodeGen/X86/avx512-cvt.ll @@ -117,46 +117,28 @@ } define <4 x float> @sltof4f32_mem(<4 x i64>* %a) { -; KNL-LABEL: sltof4f32_mem: -; KNL: # BB#0: -; KNL-NEXT: vmovdqu (%rdi), %ymm0 -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; KNL-NEXT: vpextrq $1, %xmm0, %rax -; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; KNL-NEXT: retq +; NODQ-LABEL: sltof4f32_mem: +; NODQ: # BB#0: +; NODQ-NEXT: vmovdqu (%rdi), %xmm0 +; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 +; NODQ-NEXT: vpextrq $1, %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; NODQ-NEXT: vmovq %xmm0, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; NODQ-NEXT: vmovq %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; NODQ-NEXT: vpextrq $1, %xmm1, %rax +; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; NODQ-NEXT: retq ; ; VLDQ-LABEL: sltof4f32_mem: ; VLDQ: # BB#0: ; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 ; VLDQ-NEXT: retq ; -; VLNODQ-LABEL: sltof4f32_mem: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0 -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VLNODQ-NEXT: vzeroupper -; VLNODQ-NEXT: retq -; ; AVX512DQ-LABEL: sltof4f32_mem: ; 
AVX512DQ: # BB#0: ; AVX512DQ-NEXT: vmovups (%rdi), %ymm0 @@ -164,24 +146,6 @@ ; AVX512DQ-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: sltof4f32_mem: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax -; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b Index: test/CodeGen/X86/nontemporal-loads.ll =================================================================== --- test/CodeGen/X86/nontemporal-loads.ll +++ test/CodeGen/X86/nontemporal-loads.ll @@ -810,12 +810,12 @@ ; ; AVX1-LABEL: test_arg_v8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i32: @@ -885,12 +885,12 @@ ; ; AVX1-LABEL: test_arg_v4i64: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v4i64: @@ -918,12 +918,12 @@ ; ; AVX1-LABEL: test_arg_v16i16: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i16: @@ -951,12 +951,12 @@ ; ; AVX1-LABEL: test_arg_v32i8: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm1, %xmm2, 
%xmm1 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i8: @@ -1027,18 +1027,18 @@ ; ; AVX1-LABEL: test_arg_v16i32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v16i32: @@ -1109,18 +1109,18 @@ ; ; AVX1-LABEL: test_arg_v8i64: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v8i64: @@ -1152,18 +1152,18 @@ ; ; AVX1-LABEL: test_arg_v32i16: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddw %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddw %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, 
%ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v32i16: @@ -1211,18 +1211,18 @@ ; ; AVX1-LABEL: test_arg_v64i8: ; AVX1: # BB#0: -; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovntdqa (%rdi), %xmm4 -; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa (%rdi), %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_arg_v64i8: Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1180,9 +1180,9 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vmovups (%rdi), %ymm0 ; AVX1-NEXT: vmovups 32(%rdi), %ymm1 -; AVX1-NEXT: vmovups 64(%rdi), %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm2[2],xmm3[1] +; AVX1-NEXT: vmovups 80(%rdi), %xmm2 +; AVX1-NEXT: vmovups 64(%rdi), %xmm3 +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = zero,zero,xmm3[2],xmm2[1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 @@ -1191,7 +1191,7 @@ ; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3] -; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm3[2],xmm2[3] +; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3] ; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,0,3,2] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -1201,7 +1201,7 @@ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm7 = xmm7[1,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -1347,35 +1347,36 @@ ; ; AVX1-LABEL: interleave_24i32_in: ; AVX1: # BB#0: -; AVX1-NEXT: vmovups (%rsi), %ymm0 -; AVX1-NEXT: vmovups (%rdx), %ymm1 -; AVX1-NEXT: vmovupd (%rcx), %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1],xmm3[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,0],xmm0[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm2[0,0] +; AVX1-NEXT: vmovupd (%rsi), %ymm0 +; AVX1-NEXT: vmovupd (%rcx), %ymm1 +; AVX1-NEXT: 
vmovups 16(%rcx), %xmm2 +; AVX1-NEXT: vmovups (%rdx), %xmm3 +; AVX1-NEXT: vmovups 16(%rdx), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-NEXT: vmovups (%rsi), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,0],xmm4[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,1],xmm6[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm5[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],xmm5[2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm0[1,1,3,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] +; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm4, 64(%rdi) ; AVX1-NEXT: vmovups %ymm3, (%rdi) +; AVX1-NEXT: vmovups %ymm2, 64(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1388,7 +1389,7 @@ ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1,0,1] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-NEXT: vpbroadcastq %xmm2, %ymm4 +; AVX2-NEXT: vpbroadcastq (%rcx), %ymm4 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,1,3,3] ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] Index: test/CodeGen/X86/pr22774.ll =================================================================== --- test/CodeGen/X86/pr22774.ll +++ test/CodeGen/X86/pr22774.ll @@ -7,11 +7,9 @@ define i32 @_Z3foov() { ; CHECK-LABEL: _Z3foov: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmovdqa {{.*}}(%rip), %ymm0 -; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; CHECK-NEXT: vmovdqa %xmm0, {{.*}}(%rip) +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovaps %xmm0, {{.*}}(%rip) ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: %0 = load <4 x i64>, <4 x i64>* @in, align 32 Index: test/CodeGen/X86/sandybridge-loads.ll 
=================================================================== --- test/CodeGen/X86/sandybridge-loads.ll +++ test/CodeGen/X86/sandybridge-loads.ll @@ -30,9 +30,10 @@ ; CHECK-LABEL: widestores: ; CHECK: # BB#0: ; CHECK-NEXT: vmovaps (%rdi), %ymm0 -; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovaps (%rsi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rsi), %xmm2 ; CHECK-NEXT: vmovaps %ymm0, (%rsi) -; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm2, 16(%rdi) ; CHECK-NEXT: vmovaps %xmm1, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -12,62 +12,57 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX-LABEL: shuffle_v32i8_to_v16i8: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BWVL-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512BWVL-NEXT: vmovdqu 16(%rdi), %xmm1 ; AVX512BWVL-NEXT: vmovdqu {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) -; AVX512BWVL-NEXT: 
vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> @@ -128,68 +123,59 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { ; AVX-LABEL: shuffle_v16i16_to_v8i16: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v8i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] ; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = 
mem[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512BWVL-NEXT: vmovdqu %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> @@ -246,48 +232,17 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { ; AVX-LABEL: shuffle_v8i32_to_v4i32: ; AVX: # BB#0: -; AVX-NEXT: vmovaps (%rdi), %ymm0 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX-NEXT: vmovaps (%rdi), %xmm0 +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX-NEXT: vmovaps %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; -; AVX512F-LABEL: shuffle_v8i32_to_v4i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512F-NEXT: vmovaps %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i32_to_v4i32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512VL-NEXT: vmovaps %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v8i32_to_v4i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32: -; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512BWVL-NEXT: vmovaps %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq +; AVX512-LABEL: shuffle_v8i32_to_v4i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovaps (%rdi), %xmm0 +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] +; AVX512-NEXT: vmovaps %xmm0, (%rsi) +; AVX512-NEXT: retq %vec = load <8 x i32>, <8 x i32>* %L %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> store <4 x i32> %strided.vec, <4 x i32>* %S @@ -342,65 +297,58 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { ; AVX-LABEL: shuffle_v32i8_to_v8i8: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; 
AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] ; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[0,2,2,3,4,5,6,7] +; AVX512BWVL-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> @@ -460,59 +408,46 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { ; AVX-LABEL: shuffle_v16i16_to_v4i16: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v16i16_to_v4i16: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; 
AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] +; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %L %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> @@ -571,56 +506,49 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { ; AVX-LABEL: shuffle_v32i8_to_v4i8: ; AVX: # BB#0: -; AVX-NEXT: vmovdqa (%rdi), %ymm0 -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v4i8: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vshufps 
{{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; AVX512BWVL-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BWVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <32 x i8>, <32 x i8>* %L %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -35,8 +35,8 @@ ; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -47,8 +47,8 @@ ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu8 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqu 32(%rdi), %ymm1 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] ; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -132,13 +132,11 @@ ; ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: ; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufhw 
{{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -146,11 +144,10 @@ ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: ; AVX512BWVL: # BB#0: -; AVX512BWVL-NEXT: vmovdqu16 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] -; AVX512BWVL-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqu {{.*#+}} ymm1 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] +; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 +; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX512BWVL-NEXT: vmovdqu %ymm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -177,9 +174,8 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { ; AVX512-LABEL: shuffle_v16i32_to_v8i32: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512-NEXT: vmovaps (%rdi), %ymm0 +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512-NEXT: vzeroupper Index: test/CodeGen/X86/subvector-broadcast.ll =================================================================== --- test/CodeGen/X86/subvector-broadcast.ll +++ test/CodeGen/X86/subvector-broadcast.ll @@ -1287,21 +1287,22 @@ ; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: ; X32-AVX1: ## BB#0: ## %entry ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0] -; X32-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 -; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 -; X32-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 ; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 -; X32-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 -; X32-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; X32-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; X32-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 +; X32-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4 +; X32-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; X32-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X32-AVX1-NEXT: vmovups %ymm0, _ga4 ; X32-AVX1-NEXT: vmovups %ymm2, _gb4+32 ; X32-AVX1-NEXT: vmovups %ymm1, _gb4 Index: test/CodeGen/X86/vec_int_to_fp.ll 
=================================================================== --- test/CodeGen/X86/vec_int_to_fp.ll +++ test/CodeGen/X86/vec_int_to_fp.ll @@ -2750,44 +2750,27 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: sitofp_load_4i64_to_4f64: -; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_4i64_to_4f64: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: sitofp_load_4i64_to_4f64: +; VEX: # BB#0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm1 +; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm0 +; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f64: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax @@ -2803,8 +2786,8 @@ ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -3162,52 +3145,31 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: uitofp_load_4i64_to_4f64: -; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] -; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-NEXT: vsubpd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vhaddpd %xmm1, %xmm3, %xmm1 -; 
AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vsubpd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vhaddpd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i64_to_4f64: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] -; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-NEXT: vsubpd %xmm4, %xmm1, %xmm1 -; AVX2-NEXT: vhaddpd %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vsubpd %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-NEXT: vsubpd %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vhaddpd %xmm0, %xmm3, %xmm0 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: uitofp_load_4i64_to_4f64: +; VEX: # BB#0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; VEX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; VEX-NEXT: vmovapd {{.*#+}} xmm4 = [4.503600e+15,1.934281e+25] +; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; VEX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; VEX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; VEX-NEXT: vsubpd %xmm4, %xmm1, %xmm1 +; VEX-NEXT: vhaddpd %xmm1, %xmm3, %xmm1 +; VEX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; VEX-NEXT: vsubpd %xmm4, %xmm3, %xmm3 +; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; VEX-NEXT: vsubpd %xmm4, %xmm0, %xmm0 +; VEX-NEXT: vhaddpd %xmm0, %xmm3, %xmm0 +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i64_to_4f64: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax @@ -3223,8 +3185,8 @@ ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax @@ -3400,76 +3362,55 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: sitofp_load_4i64_to_4f32: -; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax 
-; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_4i64_to_4f32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: sitofp_load_4i64_to_4f32: +; VEX: # BB#0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; 
AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: @@ -3585,128 +3526,97 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: sitofp_load_8i64_to_8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_8i64_to_8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm1 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: sitofp_load_8i64_to_8f32: +; VEX: # BB#0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; VEX-NEXT: vmovq %xmm2, 
%rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; 
AVX512VL-LABEL: sitofp_load_8i64_to_8f32: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: @@ -3893,174 +3803,103 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: uitofp_load_4i64_to_4f32: -; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_1 -; AVX1-NEXT: # BB#2: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: jmp .LBB76_3 -; AVX1-NEXT: .LBB76_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB76_3: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_4 -; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: jmp .LBB76_6 -; AVX1-NEXT: .LBB76_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: 
shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB76_6: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_7 -; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX1-NEXT: jmp .LBB76_9 -; AVX1-NEXT: .LBB76_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB76_9: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB76_10 -; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB76_10: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i64_to_4f32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_1 -; AVX2-NEXT: # BB#2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: jmp .LBB76_3 -; AVX2-NEXT: .LBB76_1: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB76_3: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_4 -; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: jmp .LBB76_6 -; AVX2-NEXT: .LBB76_4: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB76_6: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_7 -; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX2-NEXT: jmp .LBB76_9 -; AVX2-NEXT: .LBB76_7: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB76_9: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB76_10 -; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB76_10: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: 
uitofp_load_4i64_to_4f32: +; VEX: # BB#0: +; VEX-NEXT: vmovdqa (%rdi), %xmm2 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vpextrq $1, %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_1 +; VEX-NEXT: # BB#2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: jmp .LBB76_3 +; VEX-NEXT: .LBB76_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 +; VEX-NEXT: .LBB76_3: +; VEX-NEXT: vmovq %xmm2, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_4 +; VEX-NEXT: # BB#5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB76_6 +; VEX-NEXT: .LBB76_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB76_6: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_7 +; VEX-NEXT: # BB#8: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: jmp .LBB76_9 +; VEX-NEXT: .LBB76_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB76_9: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB76_10 +; VEX-NEXT: # BB#11: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq +; VEX-NEXT: .LBB76_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i64_to_4f32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, 
%xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: @@ -4324,320 +4163,193 @@ ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: uitofp_load_8i64_to_8f32: -; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_1 -; AVX1-NEXT: # BB#2: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: jmp .LBB80_3 -; AVX1-NEXT: .LBB80_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB80_3: -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_4 -; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: jmp .LBB80_6 -; AVX1-NEXT: .LBB80_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: .LBB80_6: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_7 -; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX1-NEXT: jmp .LBB80_9 -; AVX1-NEXT: .LBB80_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: .LBB80_9: -; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_10 -; AVX1-NEXT: # BB#11: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX1-NEXT: jmp .LBB80_12 -; AVX1-NEXT: .LBB80_10: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB80_12: -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_13 -; AVX1-NEXT: # BB#14: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX1-NEXT: jmp .LBB80_15 -; AVX1-NEXT: .LBB80_13: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_16 -; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq 
%rax, %xmm6, %xmm3 -; AVX1-NEXT: jmp .LBB80_18 -; AVX1-NEXT: .LBB80_16: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_19 -; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 -; AVX1-NEXT: jmp .LBB80_21 -; AVX1-NEXT: .LBB80_19: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 -; AVX1-NEXT: .LBB80_21: -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm4, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB80_22 -; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX1-NEXT: jmp .LBB80_24 -; AVX1-NEXT: .LBB80_22: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_8i64_to_8f32: -; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_1 -; AVX2-NEXT: # BB#2: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: jmp .LBB80_3 -; AVX2-NEXT: .LBB80_1: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB80_3: -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_4 -; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: jmp .LBB80_6 -; AVX2-NEXT: .LBB80_4: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: .LBB80_6: -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_7 -; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX2-NEXT: jmp .LBB80_9 -; AVX2-NEXT: .LBB80_7: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: .LBB80_9: -; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_10 -; AVX2-NEXT: # BB#11: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX2-NEXT: jmp .LBB80_12 -; AVX2-NEXT: .LBB80_10: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB80_12: -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq 
%rax, %rax -; AVX2-NEXT: js .LBB80_13 -; AVX2-NEXT: # BB#14: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX2-NEXT: jmp .LBB80_15 -; AVX2-NEXT: .LBB80_13: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 -; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: .LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_16 -; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX2-NEXT: jmp .LBB80_18 -; AVX2-NEXT: .LBB80_16: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_19 -; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 -; AVX2-NEXT: jmp .LBB80_21 -; AVX2-NEXT: .LBB80_19: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 -; AVX2-NEXT: .LBB80_21: -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX2-NEXT: vpextrq $1, %xmm4, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB80_22 -; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX2-NEXT: jmp .LBB80_24 -; AVX2-NEXT: .LBB80_22: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; VEX-LABEL: uitofp_load_8i64_to_8f32: +; VEX: # BB#0: +; VEX-NEXT: vmovdqa (%rdi), %xmm1 +; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 +; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 +; VEX-NEXT: vpextrq $1, %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_1 +; VEX-NEXT: # BB#2: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: jmp .LBB80_3 +; VEX-NEXT: .LBB80_1: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2 +; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; VEX-NEXT: .LBB80_3: +; VEX-NEXT: vmovq %xmm4, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_4 +; VEX-NEXT: # BB#5: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 +; VEX-NEXT: jmp .LBB80_6 +; VEX-NEXT: .LBB80_4: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 +; VEX-NEXT: .LBB80_6: +; VEX-NEXT: vmovq %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_7 +; VEX-NEXT: # BB#8: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; VEX-NEXT: jmp .LBB80_9 +; VEX-NEXT: .LBB80_7: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, 
%xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; VEX-NEXT: .LBB80_9: +; VEX-NEXT: vpextrq $1, %xmm3, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_10 +; VEX-NEXT: # BB#11: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; VEX-NEXT: jmp .LBB80_12 +; VEX-NEXT: .LBB80_10: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; VEX-NEXT: .LBB80_12: +; VEX-NEXT: vpextrq $1, %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_13 +; VEX-NEXT: # BB#14: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm6 +; VEX-NEXT: jmp .LBB80_15 +; VEX-NEXT: .LBB80_13: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm6 +; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 +; VEX-NEXT: .LBB80_15: +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] +; VEX-NEXT: vmovq %xmm1, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_16 +; VEX-NEXT: # BB#17: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm5 +; VEX-NEXT: jmp .LBB80_18 +; VEX-NEXT: .LBB80_16: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm1 +; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm5 +; VEX-NEXT: .LBB80_18: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[2,3] +; VEX-NEXT: vmovq %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_19 +; VEX-NEXT: # BB#20: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm4 +; VEX-NEXT: jmp .LBB80_21 +; VEX-NEXT: .LBB80_19: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm4 +; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; VEX-NEXT: .LBB80_21: +; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] +; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; VEX-NEXT: vpextrq $1, %xmm0, %rax +; VEX-NEXT: testq %rax, %rax +; VEX-NEXT: js .LBB80_22 +; VEX-NEXT: # BB#23: +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm0 +; VEX-NEXT: jmp .LBB80_24 +; VEX-NEXT: .LBB80_22: +; VEX-NEXT: movq %rax, %rcx +; VEX-NEXT: shrq %rcx +; VEX-NEXT: andl $1, %eax +; VEX-NEXT: orq %rcx, %rax +; VEX-NEXT: vcvtsi2ssq %rax, %xmm7, %xmm0 +; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; VEX-NEXT: .LBB80_24: +; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] +; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512F-NEXT: vpextrq $1, %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512F-NEXT: vcvtusi2ssq 
%rax, %xmm4, %xmm4 +; AVX512F-NEXT: vmovq %xmm2, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512F-NEXT: vmovq %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512F-NEXT: vpextrq $1, %xmm3, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: ; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm4 +; AVX512VL-NEXT: vmovq %xmm2, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm2 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX512VL-NEXT: vmovq %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm4 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] +; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; AVX512VL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3 -; AVX512VL-NEXT: 
vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm0 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm3 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm5, %xmm1 +; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: @@ -4680,9 +4392,10 @@ ; ; AVX1-LABEL: uitofp_load_8i32_to_8f32: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovaps (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 ; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 Index: test/CodeGen/X86/vector-compare-results.ll =================================================================== --- test/CodeGen/X86/vector-compare-results.ll +++ test/CodeGen/X86/vector-compare-results.ll @@ -10169,57 +10169,41 @@ ; AVX1-NEXT: movq %rsp, %rbp ; AVX1-NEXT: andq $-32, %rsp ; AVX1-NEXT: subq $32, %rsp -; AVX1-NEXT: vmovdqa 240(%rbp), %ymm8 -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm10 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm10, %xmm9 -; AVX1-NEXT: vmovdqa 208(%rbp), %ymm10 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm7, %xmm7 -; AVX1-NEXT: vpacksswb %xmm9, %xmm7, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm9 +; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8 +; AVX1-NEXT: vpcmpgtq 256(%rbp), %xmm8, %xmm8 +; AVX1-NEXT: vpcmpgtq 240(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpacksswb %xmm8, %xmm7, %xmm8 ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm7, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm10, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa 176(%rbp), %ymm9 +; AVX1-NEXT: vpcmpgtq 224(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq 208(%rbp), %xmm6, %xmm6 ; AVX1-NEXT: vpacksswb %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpacksswb %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vmovdqa 144(%rbp), %ymm10 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm5 -; AVX1-NEXT: vpacksswb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm6 +; AVX1-NEXT: vpacksswb %xmm8, %xmm6, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 +; AVX1-NEXT: vpcmpgtq 192(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq 176(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vpacksswb %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm10, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vmovdqa 112(%rbp), %ymm6 +; AVX1-NEXT: vpcmpgtq 160(%rbp), %xmm7, %xmm7 +; AVX1-NEXT: vpcmpgtq 144(%rbp), %xmm4, %xmm4 +; AVX1-NEXT: vpacksswb %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpacksswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-NEXT: 
vpcmpgtq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vmovdqa 80(%rbp), %ymm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpacksswb %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpcmpgtq 128(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq 112(%rbp), %xmm3, %xmm3 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa 48(%rbp), %ymm6 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpcmpgtq 96(%rbp), %xmm5, %xmm5 +; AVX1-NEXT: vpcmpgtq 80(%rbp), %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa 16(%rbp), %ymm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpcmpgtq 64(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq 48(%rbp), %xmm1, %xmm1 ; AVX1-NEXT: vpacksswb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpcmpgtq 32(%rbp), %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtq 16(%rbp), %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 Index: test/CodeGen/X86/vector-shift-ashr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-ashr-256.ll +++ test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1187,29 +1187,29 @@ ; ; X32-AVX1-LABEL: constant_shift_v4i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] -; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] -; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 -; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] -; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 -; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 -; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] -; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 -; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm4, %xmm2, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X32-AVX1-NEXT: vpsrlq %xmm4, %xmm5, %xmm4 +; 
X32-AVX1-NEXT: vpsrlq %xmm1, %xmm5, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vpsrlq %xmm3, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX2-LABEL: constant_shift_v4i64: Index: test/CodeGen/X86/vector-shift-lshr-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-lshr-256.ll +++ test/CodeGen/X86/vector-shift-lshr-256.ll @@ -934,18 +934,18 @@ ; ; X32-AVX1-LABEL: constant_shift_v4i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] -; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX2-LABEL: constant_shift_v4i64: Index: test/CodeGen/X86/vector-shift-shl-256.ll =================================================================== --- test/CodeGen/X86/vector-shift-shl-256.ll +++ test/CodeGen/X86/vector-shift-shl-256.ll @@ -861,18 +861,18 @@ ; ; X32-AVX1-LABEL: constant_shift_v4i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] -; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] ; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 -; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 
= xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX2-LABEL: constant_shift_v4i64: Index: test/CodeGen/X86/viabs.ll =================================================================== --- test/CodeGen/X86/viabs.ll +++ test/CodeGen/X86/viabs.ll @@ -573,24 +573,24 @@ ; ; AVX1-LABEL: test_abs_le_v8i64_fold: ; AVX1: # BB#0: -; AVX1-NEXT: vmovdqu (%rdi), %ymm0 -; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5 +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm7, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm5 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -7,15 +7,13 @@ ; AVX: # BB#0: ; AVX-NEXT: vmovupd (%rdi), %ymm0 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX-NEXT: vmovupd 96(%rdi), %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2 +; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2 +; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm3 +; AVX-NEXT: vhaddpd %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq @@ -35,15 +33,13 @@ ; AVX: # BB#0: ; AVX-NEXT: vmovupd (%rdi), 
%ymm0 ; AVX-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX-NEXT: vmovupd 96(%rdi), %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2 +; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm3 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0 +; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16 %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> @@ -84,26 +80,24 @@ ; AVX1: # BB#0: ; AVX1-NEXT: vmovupd (%rdi), %ymm0 ; AVX1-NEXT: vmovupd 32(%rdi), %ymm1 -; AVX1-NEXT: vmovupd 64(%rdi), %ymm2 -; AVX1-NEXT: vmovupd 96(%rdi), %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] +; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm2 +; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm3 +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 +; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -111,19 +105,17 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vmovdqu (%rdi), %ymm0 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu 96(%rdi), %ymm3 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm5 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] -; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = 
ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm0, %ymm2 +; AVX2-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],mem[2,3] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] +; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm5 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3] +; AVX2-NEXT: vpaddq %ymm5, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: retq %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16 %strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32>