diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11489,7 +11489,9 @@ unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) + // The cost tables encode cost 0 or cost 1 shuffles using the value 0 in + // the top 2 bits. + if (Cost == 0) return true; } diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll --- a/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll @@ -7,10 +7,9 @@ define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h -; CHECK-NEXT: ext v1.8b, v0.8b, v1.8b, #4 -; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h +; CHECK-NEXT: dup v0.4h, v0.h[4] +; CHECK-NEXT: mov v0.h[1], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: ; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll --- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll +++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll @@ -30,9 +30,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 %z = zext i32 %e to i64 @@ -57,9 +56,8 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w8, v0.s[2] +; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 2 %z = zext i32 %e to i64 @@ -110,9 +108,8 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract0_i32_zext_insert1_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: dup v0.2d, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 0 %z = zext i32 %e to i64 @@ -137,9 +134,8 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: dup v0.2d, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 %z = zext i32 %e to i64 diff --git a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll --- a/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll +++ b/llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll @@ -46,9 +46,11 @@ define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: v8i16_2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64 v2.4h, v0.4h -; CHECK-NEXT: rev64 v0.4h, v1.4h -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %V128 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll --- a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll @@ -136,11 +136,11 @@ define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: shuffle_widen_faili1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32 v2.4h, v0.4h -; CHECK-NEXT: rev32 v3.4h, v1.4h -; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4 -; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: shuffle_widen_fail2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h -; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h -; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4 -; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test_shuf1: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2 v3.4s, v7.4s, v6.4s -; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #12 -; CHECK-NEXT: uzp1 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v4.4s, v2.4s, v4.4s -; CHECK-NEXT: trn2 v3.4s, v7.4s, v3.4s -; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #8 -; CHECK-NEXT: trn2 v6.4s, v6.4s, v1.4s -; CHECK-NEXT: trn1 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: ext v3.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: rev64 v16.4s, v5.4s -; CHECK-NEXT: dup v7.4s, v7.s[0] -; CHECK-NEXT: ext v1.16b, v0.16b, v6.16b, #12 -; CHECK-NEXT: mov v2.s[3], v7.s[3] -; CHECK-NEXT: ext v0.16b, v3.16b, v4.16b, #8 -; CHECK-NEXT: ext v3.16b, v5.16b, v16.16b, #8 +; CHECK-NEXT: uzp1 v16.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v4.16b, #12 +; CHECK-NEXT: zip2 v6.4s, v7.4s, v6.4s +; CHECK-NEXT: uzp2 v17.4s, v2.4s, v4.4s +; CHECK-NEXT: trn2 v16.4s, v16.4s, v1.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: trn2 v4.4s, v7.4s, v6.4s +; CHECK-NEXT: rev64 v5.4s, v7.4s +; CHECK-NEXT: trn1 v2.4s, v17.4s, v2.4s +; CHECK-NEXT: dup v6.4s, v7.s[0] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: ext v1.16b, v0.16b, v16.16b, #12 +; CHECK-NEXT: mov v2.s[3], v6.s[3] +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> ret <16 x i32> %s3 @@ -29,10 +28,9 @@ ; CHECK-LABEL: test_shuf2: ; CHECK: // %bb.0: ; CHECK-NEXT: zip2 v0.4s, v7.4s, v6.4s -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 ; CHECK-NEXT: trn2 v0.4s, v7.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8 +; CHECK-NEXT: mov v0.d[1], v1.d[1] ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> ret <4 x i32> %s3 @@ -64,10 +62,9 @@ define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test_shuf5: ; CHECK: // %bb.0: +; CHECK-NEXT: rev64 v1.4s, v7.4s ; CHECK-NEXT: ext v0.16b, v6.16b, v4.16b, #12 -; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #8 -; CHECK-NEXT: rev64 v1.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: mov v0.d[1], v1.d[1] ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> ret <4 x i32> %s3