Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -6734,189 +6734,130 @@
 }
 
 define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_add_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X86-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_add_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm512_reduce_add_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
 entry:
-  %0 = bitcast <8 x i64> %__W to <16 x i32>
-  %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %add7.i = add <4 x i32> %shuffle6.i, %add4.i
-  %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %add10.i = add <4 x i32> %shuffle9.i, %add7.i
-  %1 = bitcast <4 x i32> %add10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %1, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
+  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
+  %add.i = add <8 x i32> %0, %1
+  %2 = bitcast <8 x i32> %add.i to <4 x i64>
+  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
+  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
+  %add5.i = add <4 x i32> %3, %4
+  %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %add6.i = add <4 x i32> %shuffle.i, %add5.i
+  %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %add8.i = add <4 x i32> %shuffle7.i, %add6.i
+  %vecext.i = extractelement <4 x i32> %add8.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_mul_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_mul_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm512_reduce_mul_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpmulld %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
 entry:
-  %0 = bitcast <8 x i64> %__W to <16 x i32>
-  %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
-  %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
-  %1 = bitcast <4 x i32> %mul10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %1, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
+  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
+  %mul.i = mul <8 x i32> %0, %1
+  %2 = bitcast <8 x i32> %mul.i to <4 x i64>
+  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
+  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
+  %mul5.i = mul <4 x i32> %3, %4
+  %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
+  %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
+  %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_reduce_or_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_or_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpor %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_or_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpor %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm512_reduce_or_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpor %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
 entry:
-  %0 = bitcast <8 x i64> %__W to <16 x i32>
-  %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %or7.i = or <4 x i32> %shuffle6.i, %or4.i
-  %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %or10.i = or <4 x i32> %shuffle9.i, %or7.i
-  %1 = bitcast <4 x i32> %or10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %1, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %or25.i = or <4 x i64> %extract.i, %extract2.i
+  %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %or526.i = or <2 x i64> %extract3.i, %extract4.i
+  %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
+  %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %or6.i = or <4 x i32> %shuffle.i, %or5.i
+  %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %or8.i = or <4 x i32> %shuffle7.i, %or6.i
+  %vecext.i = extractelement <4 x i32> %or8.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
-; X86-LABEL: test_mm512_reduce_and_epi32:
-; X86: # %bb.0: # %entry
-; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X86-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X86-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X86-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X86-NEXT: vmovd %xmm0, %eax
-; X86-NEXT: vzeroupper
-; X86-NEXT: retl
-;
-; X64-LABEL: test_mm512_reduce_and_epi32:
-; X64: # %bb.0: # %entry
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; X64-NEXT: vpand %ymm1, %ymm0, %ymm0
-; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-NEXT: vpand %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
-; X64-NEXT: vzeroupper
-; X64-NEXT: retq
+; CHECK-LABEL: test_mm512_reduce_and_epi32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vmovd %xmm0, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: ret{{[l|q]}}
 entry:
-  %0 = bitcast <8 x i64> %__W to <16 x i32>
-  %shuffle.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %0, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %and7.i = and <4 x i32> %shuffle6.i, %and4.i
-  %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %and10.i = and <4 x i32> %shuffle9.i, %and7.i
-  %1 = bitcast <4 x i32> %and10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %1, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %and25.i = and <4 x i64> %extract.i, %extract2.i
+  %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %and526.i = and <2 x i64> %extract3.i, %extract4.i
+  %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
+  %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %and6.i = and <4 x i32> %shuffle.i, %and5.i
+  %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %and8.i = and <4 x i32> %shuffle7.i, %and6.i
+  %vecext.i = extractelement <4 x i32> %and8.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -6946,28 +6887,31 @@
 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
   %0 = bitcast <8 x i64> %__W to <16 x i32>
   %1 = bitcast i16 %__M to <16 x i1>
   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
-  %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %add.i = add <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %add.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %add4.i = add <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %add4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %add7.i = add <4 x i32> %shuffle6.i, %add4.i
-  %shuffle9.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %add10.i = add <4 x i32> %shuffle9.i, %add7.i
-  %3 = bitcast <4 x i32> %add10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %3, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
+  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
+  %add.i = add <8 x i32> %4, %5
+  %6 = bitcast <8 x i32> %add.i to <4 x i64>
+  %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
+  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
+  %add6.i = add <4 x i32> %7, %8
+  %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %add7.i = add <4 x i32> %shuffle.i, %add6.i
+  %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %add9.i = add <4 x i32> %shuffle8.i, %add7.i
+  %vecext.i = extractelement <4 x i32> %add9.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -7001,28 +6945,31 @@
 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
   %0 = bitcast <8 x i64> %__W to <16 x i32>
   %1 = bitcast i16 %__M to <16 x i1>
   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %mul.i = mul <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %mul.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %mul4.i = mul <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %mul4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %mul7.i = mul <4 x i32> %shuffle6.i, %mul4.i
-  %shuffle9.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %mul10.i = mul <4 x i32> %shuffle9.i, %mul7.i
-  %3 = bitcast <4 x i32> %mul10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %3, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
+  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
+  %mul.i = mul <8 x i32> %4, %5
+  %6 = bitcast <8 x i32> %mul.i to <4 x i64>
+  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
+  %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
+  %mul7.i = mul <4 x i32> %7, %8
+  %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
+  %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
+  %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -7056,28 +7003,27 @@
 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
   %0 = bitcast <8 x i64> %__W to <16 x i32>
   %1 = bitcast i16 %__M to <16 x i1>
   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %and.i = and <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %and.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %and4.i = and <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %and4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %and7.i = and <4 x i32> %shuffle6.i, %and4.i
-  %shuffle9.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %and10.i = and <4 x i32> %shuffle9.i, %and7.i
-  %3 = bitcast <4 x i32> %and10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %3, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %and28.i = and <4 x i64> %extract.i, %extract4.i
+  %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %and729.i = and <2 x i64> %extract5.i, %extract6.i
+  %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
+  %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %and8.i = and <4 x i32> %shuffle.i, %and7.i
+  %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %and10.i = and <4 x i32> %shuffle9.i, %and8.i
+  %vecext.i = extractelement <4 x i32> %and10.i, i32 0
+  ret i32 %vecext.i
 }
 
 define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
@@ -7109,28 +7055,27 @@
 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vmovq %xmm0, %rax
-; X64-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-NEXT: vmovd %xmm0, %eax
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
   %0 = bitcast <8 x i64> %__W to <16 x i32>
   %1 = bitcast i16 %__M to <16 x i1>
   %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
-  %shuffle.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x i32> %2, <16 x i32> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %or.i = or <8 x i32> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x i32> %or.i, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %or4.i = or <4 x i32> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x i32> %or4.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %or7.i = or <4 x i32> %shuffle6.i, %or4.i
-  %shuffle9.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
-  %or10.i = or <4 x i32> %shuffle9.i, %or7.i
-  %3 = bitcast <4 x i32> %or10.i to <2 x i64>
-  %vecext.i = extractelement <2 x i64> %3, i32 0
-  %conv.i = trunc i64 %vecext.i to i32
-  ret i32 %conv.i
+  %3 = bitcast <16 x i32> %2 to <8 x i64>
+  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %or27.i = or <4 x i64> %extract.i, %extract3.i
+  %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  %or628.i = or <2 x i64> %extract4.i, %extract5.i
+  %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
+  %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %or7.i = or <4 x i32> %shuffle.i, %or6.i
+  %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
+  %or9.i = or <4 x i32> %shuffle8.i, %or7.i
+  %vecext.i = extractelement <4 x i32> %or9.i, i32 0
+  ret i32 %vecext.i
 }
 
 define double @test_mm512_reduce_add_pd(<8 x double> %__W) {
@@ -7147,7 +7092,8 @@
 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vmovlpd %xmm0, (%esp)
 ; X86-NEXT: fldl (%esp)
 ; X86-NEXT: movl %ebp, %esp
@@ -7162,7 +7108,8 @@
 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
@@ -7172,7 +7119,7 @@
   %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
   %vecext.i = extractelement <2 x double> %add7.i, i32 0
   ret double %vecext.i
@@ -7219,7 +7166,7 @@
   %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
   %vecext.i = extractelement <2 x double> %mul7.i, i32 0
   ret double %vecext.i
@@ -7236,7 +7183,8 @@
 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%esp)
 ; X86-NEXT: flds (%esp)
 ; X86-NEXT: popl %eax
@@ -7252,21 +7200,25 @@
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
-  %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
-  %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-  %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
-  %vecext.i = extractelement <4 x float> %add10.i, i32 0
+  %0 = bitcast <16 x float> %__W to <8 x double>
+  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x double> %extract.i to <8 x float>
+  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <4 x double> %extract2.i to <8 x float>
+  %add.i = fadd <8 x float> %1, %2
+  %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %add5.i = fadd <4 x float> %extract3.i, %extract4.i
+  %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %add6.i = fadd <4 x float> %add5.i, %shuffle.i
+  %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
+  %vecext.i = extractelement <4 x float> %add8.i, i32 0
   ret float %vecext.i
 }
 
@@ -7281,7 +7233,7 @@
 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%esp)
 ; X86-NEXT: flds (%esp)
@@ -7298,22 +7250,25 @@
 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
-  %shuffle.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x float> %__W, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
-  %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-  %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
-  %vecext.i = extractelement <4 x float> %mul10.i, i32 0
+  %0 = bitcast <16 x float> %__W to <8 x double>
+  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <4 x double> %extract.i to <8 x float>
+  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %2 = bitcast <4 x double> %extract2.i to <8 x float>
+  %mul.i = fmul <8 x float> %1, %2
+  %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %mul5.i = fmul <4 x float> %extract3.i, %extract4.i
+  %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i
+  %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i
+  %vecext.i = extractelement <4 x float> %mul8.i, i32 0
   ret float %vecext.i
 }
 
@@ -7334,7 +7289,8 @@
 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vmovlpd %xmm0, (%esp)
 ; X86-NEXT: fldl (%esp)
 ; X86-NEXT: movl %ebp, %esp
@@ -7351,7 +7307,8 @@
 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
@@ -7363,7 +7320,7 @@
   %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
   %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
   %vecext.i = extractelement <2 x double> %add7.i, i32 0
   ret double %vecext.i
@@ -7419,7 +7376,7 @@
   %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
   %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
   %vecext.i = extractelement <2 x double> %mul7.i, i32 0
   ret double %vecext.i
@@ -7438,7 +7395,8 @@
 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%esp)
 ; X86-NEXT: flds (%esp)
 ; X86-NEXT: popl %eax
@@ -7456,23 +7414,27 @@
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
+; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
   %0 = bitcast i16 %__M to <16 x i1>
   %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer
-  %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %add.i = fadd <8 x float> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %add4.i = fadd <4 x float> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x float> %add4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %add7.i = fadd <4 x float> %add4.i, %shuffle6.i
-  %shuffle9.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-  %add10.i = fadd <4 x float> %add7.i, %shuffle9.i
-  %vecext.i = extractelement <4 x float> %add10.i, i32 0
+  %2 = bitcast <16 x float> %1 to <8 x double>
+  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <4 x double> %extract.i to <8 x float>
+  %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <4 x double> %extract3.i to <8 x float>
+  %add.i = fadd <8 x float> %3, %4
+  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %add6.i = fadd <4 x float> %extract4.i, %extract5.i
+  %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %add7.i = fadd <4 x float> %add6.i, %shuffle.i
+  %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %add9.i = fadd <4 x float> %add7.i, %shuffle8.i
+  %vecext.i = extractelement <4 x float> %add9.i, i32 0
   ret float %vecext.i
 }
 
@@ -7490,7 +7452,7 @@
 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X86-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%esp)
 ; X86-NEXT: flds (%esp)
@@ -7510,23 +7472,26 @@
 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
-; X64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
 ; X64-NEXT: retq
 entry:
   %0 = bitcast i16 %__M to <16 x i1>
   %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-  %shuffle.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %shuffle1.i = shufflevector <16 x float> %1, <16 x float> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %mul.i = fmul <8 x float> %shuffle.i, %shuffle1.i
-  %shuffle2.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %shuffle3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %mul4.i = fmul <4 x float> %shuffle2.i, %shuffle3.i
-  %shuffle6.i = shufflevector <4 x float> %mul4.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
-  %mul7.i = fmul <4 x float> %mul4.i, %shuffle6.i
-  %shuffle9.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-  %mul10.i = fmul <4 x float> %mul7.i, %shuffle9.i
+  %2 = bitcast <16 x float> %1 to <8 x double>
+  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = bitcast <4 x double> %extract.i to <8 x float>
+  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %4 = bitcast <4 x double> %extract4.i to <8 x float>
+  %mul.i = fmul <8 x float> %3, %4
+  %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %mul7.i = fmul <4 x float> %extract5.i, %extract6.i
+  %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i
+  %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i
   %vecext.i = extractelement <4 x float> %mul10.i, i32 0
   ret float %vecext.i
 }
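
Editor's note, not part of the patch: every test above exercises the same log2 shuffle-tree reduction, and each hunk only changes which shuffle masks clang emits for the last two steps. Below is a minimal standalone sketch of that idiom on a single <4 x i32> vector, in LLVM IR; the function name is hypothetical.

; Sum all four lanes of a <4 x i32> with two shuffle+add steps.
define i32 @sketch_reduce_add_v4i32(<4 x i32> %v) {
entry:
  ; Step 1: swap the two 64-bit halves and add; each lane now holds a pair sum.
  %s1 = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %a1 = add <4 x i32> %s1, %v
  ; Step 2: fold lane 1 into lane 0; lane 0 now holds the total.
  %s2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 3>
  %a2 = add <4 x i32> %s2, %a1
  %r = extractelement <4 x i32> %a2, i32 0
  ret i32 %r
}

Whether the backend folds the final shuffle+op pair into a horizontal op (vphaddd in the integer tests, vhaddpd/vhaddps in the old float checks) depends on the exact mask it sees, which is what the vhadd-to-vpermil changes in the hunks above reflect.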