Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14469,7 +14469,7 @@ MaxIndex = std::max(MaxIndex, Index); } - NearestPow2 = PowerOf2Ceil(MaxIndex); + NearestPow2 = PowerOf2Ceil(MaxIndex + 1); if (InVT.isSimple() && (NearestPow2 > 2) && ((NumElems * 2) < NearestPow2)) { unsigned SplitSize = NearestPow2 / 2; Index: test/CodeGen/ARM/crash-on-pow2-shufflevector.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/crash-on-pow2-shufflevector.ll @@ -0,0 +1,63 @@ +; RUN: llc %s -mtriple=armv7--linux-android -o /dev/null +; +; IR is the (slightly reduced) result of running `clang -fprofile-generate +; --target=arm-linux-androideabi -march=armv7-a -O2 -S -emit-llvm ~/tc.c -o -`. +; +; struct desc { +; int bar; +; int pad[7]; +; }; +; +; int foo(struct desc* descs, unsigned num, unsigned cw) { +; int accum = 0; +; for (int c = 0; c < cw; c++) { +; accum += descs[c].bar << (c ? 
0 : cw); +; } +; return accum; +; } +; + +%struct.desc = type { i32, [7 x i32] } + +define i32 @foo(%struct.desc* nocapture readonly %descs, i32 %num, i32 %cw) { +for.body.lr.ph: + %0 = alloca i8 + %1 = alloca i8 + br label %vector.ph + +vector.ph: + %n.mod.vf = and i32 %cw, 1 + %2 = icmp eq i32 %n.mod.vf, 0 + %3 = select i1 %2, i32 2, i32 %n.mod.vf + %n.vec = sub i32 %cw, %3 + %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %cw, i32 0 + %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <2 x i64> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ] + %vec.ind = phi <2 x i32> [ <i32 0, i32 1>, %vector.ph ], [ %vec.ind.next, %vector.body ] + %vec.phi18 = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ] + %4 = getelementptr inbounds %struct.desc, %struct.desc* %descs, i32 %index, i32 0 + %5 = bitcast i32* %4 to <16 x i32>* + %wide.vec = load <16 x i32>, <16 x i32>* %5, align 4 + %strided.vec = shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <2 x i32> <i32 0, i32 8> + %6 = icmp eq <2 x i32> %vec.ind, zeroinitializer + %7 = zext <2 x i1> %6 to <2 x i64> + %8 = add <2 x i64> %vec.phi, %7 + %9 = select <2 x i1> %6, <2 x i32> %broadcast.splat, <2 x i32> zeroinitializer + %10 = shl <2 x i32> %strided.vec, %9 + %11 = add nsw <2 x i32> %10, %vec.phi18 + %index.next = add i32 %index, 2 + %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2> + %12 = icmp eq i32 %index.next, %n.vec + br i1 %12, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf19 = shufflevector <2 x i32> %11, <2 x i32> undef, <2 x i32> undef + br label %for.cond.cleanup + +for.cond.cleanup: + ret i32 undef +} Index: test/CodeGen/X86/avx512-shuffles/partial_permute.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ 
test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -2682,10 +2682,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2696,11 +2695,11 @@ ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # BB#0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; CHECK-NEXT: movb $2, %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm1[0] +; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -2713,11 +2712,11 @@ ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0: ; CHECK: # BB#0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; CHECK-NEXT: movb $2, %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp @@ -4527,7 +4526,7 @@ ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # BB#0: ; CHECK-NEXT: vmovapd (%rdi), %zmm1 -; CHECK-NEXT: vextractf32x4 $2, %zmm1, %xmm2 +; 
CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[0] @@ -4543,7 +4542,7 @@ ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1: ; CHECK: # BB#0: ; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 ; CHECK-NEXT: movb $1, %al ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0] Index: test/CodeGen/X86/vector-shuffle-512-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v8.ll +++ test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -2685,17 +2685,19 @@ define <2 x double> @test_v8f64_34 (<8 x double> %v) { ; AVX512F-LABEL: test_v8f64_34: ; AVX512F: # BB#0: -; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: test_v8f64_34: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 -; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; AVX512F-32-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512F-32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2] +; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512F-32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill> ; AVX512F-32-NEXT: vzeroupper ; AVX512F-32-NEXT: retl %res = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 3, i32 4>