Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10205,10 +10205,7 @@ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } return GenerateTBL(Op, ShuffleMask, DAG); @@ -11492,7 +11489,8 @@ unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) + // The cost tables encode cost 1 (or 0) shuffles using the value 0. + if (Cost == 0) return true; } Index: llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll +++ llvm/test/CodeGen/AArch64/aarch64-wide-shuffle.ll @@ -7,10 +7,9 @@ define <4 x i16> @f(<4 x i32> %vqdmlal_v3.i, <8 x i16> %x5) { ; CHECK-LABEL: f: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h -; CHECK-NEXT: ext v1.8b, v0.8b, v1.8b, #4 -; CHECK-NEXT: uzp1 v0.4h, v1.4h, v0.4h +; CHECK-NEXT: dup v0.4h, v0.h[4] +; CHECK-NEXT: mov v0.h[1], v1.h[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret entry: ; Check that we don't just dup the input vector. The code emitted is ext, dup, ext, ext Index: llvm/test/CodeGen/AArch64/build-vector-extract.ll =================================================================== --- llvm/test/CodeGen/AArch64/build-vector-extract.ll +++ llvm/test/CodeGen/AArch64/build-vector-extract.ll @@ -30,9 +30,8 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 %z = zext i32 %e to i64 @@ -57,9 +56,8 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract2_i32_zext_insert0_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w8, v0.s[2] +; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 2 %z = zext i32 %e to i64 @@ -110,9 +108,8 @@ define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract0_i32_zext_insert1_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: dup v0.2d, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 0 %z = zext i32 %e to i64 @@ -137,9 +134,8 @@ define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) { ; CHECK-LABEL: extract1_i32_zext_insert1_i64_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: dup v0.2d, x8 ; CHECK-NEXT: ret %e = extractelement <4 x i32> %x, i32 1 %z = zext i32 %e to i64 Index: llvm/test/CodeGen/AArch64/insert-extend.ll =================================================================== --- llvm/test/CodeGen/AArch64/insert-extend.ll +++ llvm/test/CodeGen/AArch64/insert-extend.ll @@ -380,117 +380,114 @@ ; CHECK-NEXT: zip2 v5.4s, v1.4s, v5.4s ; CHECK-NEXT: zip1 v21.4s, v1.4s, v17.4s ; CHECK-NEXT: zip1 v22.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v20.d[1], v7.d[1] +; CHECK-NEXT: rev64 v6.4s, v16.4s ; CHECK-NEXT: mov v4.d[1], v5.d[1] +; CHECK-NEXT: mov v20.d[1], v7.d[1] ; CHECK-NEXT: trn2 v5.4s, v1.4s, v21.4s ; CHECK-NEXT: ext v7.16b, v19.16b, v22.16b, #8 ; CHECK-NEXT: zip1 v21.4s, v1.4s, v1.4s ; CHECK-NEXT: rev64 v1.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v16.4s +; CHECK-NEXT: ext v16.16b, v16.16b, v16.16b, #4 +; CHECK-NEXT: trn2 v6.4s, v6.4s, v0.4s ; CHECK-NEXT: mov v5.d[1], v7.d[1] ; CHECK-NEXT: ext v7.16b, v21.16b, v17.16b, #4 ; CHECK-NEXT: zip2 v1.4s, v1.4s, v17.4s ; CHECK-NEXT: mov v19.s[3], v18.s[2] ; CHECK-NEXT: zip2 v17.4s, v3.4s, v2.4s -; CHECK-NEXT: ext v16.16b, v16.16b, v16.16b, #4 -; CHECK-NEXT: trn2 v6.4s, v6.4s, v0.4s ; CHECK-NEXT: trn2 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: trn2 v0.4s, v0.4s, v16.4s ; CHECK-NEXT: mov v1.d[1], v19.d[1] ; CHECK-NEXT: mov v22.d[1], v17.d[1] -; CHECK-NEXT: trn2 v0.4s, v0.4s, v16.4s ; CHECK-NEXT: mov v2.d[1], v7.d[1] +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 ; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s ; CHECK-NEXT: sub v3.4s, v20.4s, v22.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v1.4s, v3.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s -; CHECK-NEXT: add v17.4s, v20.4s, v22.4s ; CHECK-NEXT: add v4.4s, v6.4s, v0.4s -; CHECK-NEXT: trn2 v5.4s, v1.4s, v5.4s +; CHECK-NEXT: zip2 v5.4s, v1.4s, v3.4s ; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s ; CHECK-NEXT: uzp1 v6.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v2.16b, #12 -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: ext v16.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: trn2 v5.4s, v1.4s, v5.4s ; CHECK-NEXT: mov v7.16b, v0.16b ; CHECK-NEXT: trn2 v6.4s, v6.4s, v2.4s +; CHECK-NEXT: add v17.4s, v20.4s, v22.4s ; CHECK-NEXT: mov v7.d[1], v4.d[1] -; CHECK-NEXT: ext v5.16b, v5.16b, v16.16b, #8 -; CHECK-NEXT: mov v16.16b, v17.16b +; CHECK-NEXT: mov v5.d[1], v16.d[1] ; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 -; CHECK-NEXT: ext v6.16b, v3.16b, v0.16b, #12 -; CHECK-NEXT: uzp2 v0.4s, v17.4s, v0.4s +; CHECK-NEXT: uzp2 v6.4s, v17.4s, v0.4s +; CHECK-NEXT: mov v16.16b, v17.16b ; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s ; CHECK-NEXT: add v7.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v6.16b, v1.16b, v6.16b, #8 -; CHECK-NEXT: uzp2 v18.4s, v5.4s, v7.4s -; CHECK-NEXT: trn1 v0.4s, v0.4s, v17.4s -; CHECK-NEXT: dup v17.4s, v1.s[0] +; CHECK-NEXT: trn1 v6.4s, v6.4s, v17.4s +; CHECK-NEXT: uzp2 v17.4s, v5.4s, v7.4s +; CHECK-NEXT: dup v18.4s, v1.s[0] +; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #12 ; CHECK-NEXT: mov v16.d[1], v3.d[1] -; CHECK-NEXT: ext v3.16b, v18.16b, v7.16b, #4 -; CHECK-NEXT: rev64 v19.4s, v6.4s -; CHECK-NEXT: mov v0.s[3], v17.s[3] -; CHECK-NEXT: rev64 v17.4s, v7.4s -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v18.4s -; CHECK-NEXT: ext v6.16b, v6.16b, v19.16b, #8 -; CHECK-NEXT: sub v18.4s, v16.4s, v0.4s -; CHECK-NEXT: add v0.4s, v16.4s, v0.4s -; CHECK-NEXT: zip2 v17.4s, v17.4s, v5.4s -; CHECK-NEXT: dup v19.4s, v0.s[0] -; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: sub v20.4s, v1.4s, v6.4s -; CHECK-NEXT: mov v16.s[3], v19.s[3] -; CHECK-NEXT: ext v17.16b, v0.16b, v17.16b, #8 +; CHECK-NEXT: rev64 v3.4s, v1.4s +; CHECK-NEXT: ext v19.16b, v17.16b, v7.16b, #4 +; CHECK-NEXT: mov v6.s[3], v18.s[3] ; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v4.16b, v20.16b, v18.16b, #12 +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: uzp2 v3.4s, v19.4s, v17.4s +; CHECK-NEXT: add v4.4s, v16.4s, v6.4s +; CHECK-NEXT: sub v17.4s, v16.4s, v6.4s +; CHECK-NEXT: dup v18.4s, v4.s[0] +; CHECK-NEXT: mov v16.16b, v3.16b +; CHECK-NEXT: sub v6.4s, v1.4s, v0.4s +; CHECK-NEXT: mov v16.s[3], v18.s[3] +; CHECK-NEXT: ext v18.16b, v6.16b, v17.16b, #12 +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v1.4s, v4.4s +; CHECK-NEXT: rev64 v19.4s, v7.4s ; CHECK-NEXT: add v16.4s, v7.4s, v16.4s -; CHECK-NEXT: rev64 v19.4s, v17.4s ; CHECK-NEXT: mov v7.s[2], v2.s[2] -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s -; CHECK-NEXT: trn1 v4.4s, v4.4s, v20.4s -; CHECK-NEXT: ext v6.16b, v17.16b, v19.16b, #8 -; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #8 -; CHECK-NEXT: uzp2 v19.4s, v5.4s, v1.4s +; CHECK-NEXT: trn1 v18.4s, v18.4s, v6.4s +; CHECK-NEXT: add v20.4s, v4.4s, v1.4s +; CHECK-NEXT: mov v4.s[1], v17.s[1] +; CHECK-NEXT: ext v17.16b, v0.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v21.4s, v5.4s, v0.4s +; CHECK-NEXT: zip2 v19.4s, v19.4s, v5.4s ; CHECK-NEXT: dup v2.4s, v2.s[2] ; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s -; CHECK-NEXT: mov v4.s[3], v2.s[3] +; CHECK-NEXT: mov v18.s[3], v2.s[3] ; CHECK-NEXT: ext v2.16b, v16.16b, v3.16b, #12 -; CHECK-NEXT: mov v20.s[2], v1.s[2] -; CHECK-NEXT: add v21.4s, v0.4s, v6.4s -; CHECK-NEXT: mov v0.s[1], v18.s[1] -; CHECK-NEXT: trn1 v3.4s, v19.4s, v17.4s -; CHECK-NEXT: add v4.4s, v5.4s, v4.4s +; CHECK-NEXT: mov v6.s[2], v0.s[2] +; CHECK-NEXT: trn1 v3.4s, v21.4s, v17.4s +; CHECK-NEXT: mov v19.d[1], v1.d[1] +; CHECK-NEXT: add v1.4s, v5.4s, v18.4s ; CHECK-NEXT: ext v5.16b, v2.16b, v16.16b, #4 ; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v20.4s, v3.4s +; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v19.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s ; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #12 -; CHECK-NEXT: mov v0.s[2], v21.s[2] -; CHECK-NEXT: mov v3.s[1], v1.s[1] -; CHECK-NEXT: movi v1.8h, #1 +; CHECK-NEXT: mov v3.s[1], v0.s[1] +; CHECK-NEXT: mov v4.s[2], v20.s[2] +; CHECK-NEXT: movi v0.8h, #1 ; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff ; CHECK-NEXT: ushr v5.4s, v2.4s, #15 -; CHECK-NEXT: ushr v6.4s, v0.4s, #15 -; CHECK-NEXT: ushr v7.4s, v3.4s, #15 +; CHECK-NEXT: ushr v6.4s, v3.4s, #15 +; CHECK-NEXT: ushr v7.4s, v1.4s, #15 ; CHECK-NEXT: ushr v16.4s, v4.4s, #15 -; CHECK-NEXT: and v5.16b, v5.16b, v1.16b -; CHECK-NEXT: and v7.16b, v7.16b, v1.16b -; CHECK-NEXT: and v6.16b, v6.16b, v1.16b -; CHECK-NEXT: and v1.16b, v16.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v0.16b +; CHECK-NEXT: and v6.16b, v6.16b, v0.16b +; CHECK-NEXT: and v16.16b, v16.16b, v0.16b +; CHECK-NEXT: and v0.16b, v7.16b, v0.16b ; CHECK-NEXT: mul v5.4s, v5.4s, v17.4s -; CHECK-NEXT: mul v7.4s, v7.4s, v17.4s ; CHECK-NEXT: mul v6.4s, v6.4s, v17.4s -; CHECK-NEXT: mul v1.4s, v1.4s, v17.4s +; CHECK-NEXT: mul v7.4s, v16.4s, v17.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s ; CHECK-NEXT: add v2.4s, v5.4s, v2.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s -; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: add v4.4s, v1.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: add v4.4s, v7.4s, v4.4s +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b +; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b +; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b +; CHECK-NEXT: eor v1.16b, v4.16b, v7.16b ; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -673,20 +670,20 @@ ; CHECK-NEXT: zip2 v5.4s, v1.4s, v5.4s ; CHECK-NEXT: zip1 v21.4s, v1.4s, v17.4s ; CHECK-NEXT: zip1 v22.4s, v19.4s, v18.4s -; CHECK-NEXT: mov v20.d[1], v7.d[1] +; CHECK-NEXT: rev64 v6.4s, v16.4s ; CHECK-NEXT: mov v4.d[1], v5.d[1] +; CHECK-NEXT: mov v20.d[1], v7.d[1] ; CHECK-NEXT: trn2 v5.4s, v1.4s, v21.4s ; CHECK-NEXT: ext v7.16b, v19.16b, v22.16b, #8 ; CHECK-NEXT: zip1 v21.4s, v1.4s, v1.4s ; CHECK-NEXT: rev64 v1.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v16.4s +; CHECK-NEXT: ext v16.16b, v16.16b, v16.16b, #4 +; CHECK-NEXT: trn2 v6.4s, v6.4s, v0.4s ; CHECK-NEXT: mov v5.d[1], v7.d[1] ; CHECK-NEXT: ext v7.16b, v21.16b, v17.16b, #4 -; CHECK-NEXT: ext v16.16b, v16.16b, v16.16b, #4 ; CHECK-NEXT: zip2 v1.4s, v1.4s, v17.4s ; CHECK-NEXT: mov v19.s[3], v18.s[2] ; CHECK-NEXT: zip2 v17.4s, v3.4s, v2.4s -; CHECK-NEXT: trn2 v6.4s, v6.4s, v0.4s ; CHECK-NEXT: trn2 v2.4s, v3.4s, v2.4s ; CHECK-NEXT: trn2 v0.4s, v0.4s, v16.4s ; CHECK-NEXT: mov v1.d[1], v19.d[1] @@ -696,73 +693,70 @@ ; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s ; CHECK-NEXT: sub v3.4s, v20.4s, v22.4s ; CHECK-NEXT: add v2.4s, v4.4s, v2.4s +; CHECK-NEXT: add v4.4s, v6.4s, v0.4s ; CHECK-NEXT: zip2 v5.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v7.4s, v6.4s, v0.4s -; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: ext v18.16b, v2.16b, v2.16b, #12 -; CHECK-NEXT: trn2 v4.4s, v1.4s, v5.4s -; CHECK-NEXT: uzp1 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: add v5.4s, v20.4s, v22.4s -; CHECK-NEXT: mov v17.16b, v7.16b -; CHECK-NEXT: ext v4.16b, v2.16b, v4.16b, #8 +; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s +; CHECK-NEXT: uzp1 v6.4s, v2.4s, v4.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: trn2 v5.4s, v1.4s, v5.4s +; CHECK-NEXT: mov v7.16b, v0.16b ; CHECK-NEXT: trn2 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: mov v17.d[1], v0.d[1] -; CHECK-NEXT: ext v16.16b, v3.16b, v7.16b, #12 -; CHECK-NEXT: ext v4.16b, v4.16b, v18.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v6.16b, #12 -; CHECK-NEXT: uzp2 v7.4s, v5.4s, v7.4s -; CHECK-NEXT: mov v19.16b, v5.16b -; CHECK-NEXT: mov v19.d[1], v3.d[1] -; CHECK-NEXT: sub v3.4s, v17.4s, v4.4s -; CHECK-NEXT: add v4.4s, v2.4s, v0.4s -; CHECK-NEXT: trn1 v5.4s, v7.4s, v5.4s -; CHECK-NEXT: uzp2 v7.4s, v3.4s, v4.4s -; CHECK-NEXT: dup v6.4s, v1.s[0] -; CHECK-NEXT: rev64 v17.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v16.16b, #8 -; CHECK-NEXT: mov v5.s[3], v6.s[3] -; CHECK-NEXT: ext v6.16b, v7.16b, v4.16b, #4 -; CHECK-NEXT: zip2 v17.4s, v17.4s, v3.4s -; CHECK-NEXT: rev64 v18.4s, v16.4s -; CHECK-NEXT: add v20.4s, v19.4s, v5.4s -; CHECK-NEXT: uzp2 v6.4s, v6.4s, v7.4s -; CHECK-NEXT: ext v7.16b, v20.16b, v17.16b, #8 -; CHECK-NEXT: ext v16.16b, v16.16b, v18.16b, #8 -; CHECK-NEXT: dup v21.4s, v20.s[0] -; CHECK-NEXT: mov v17.16b, v6.16b -; CHECK-NEXT: rev64 v18.4s, v7.4s -; CHECK-NEXT: mov v17.s[3], v21.s[3] -; CHECK-NEXT: sub v22.4s, v1.4s, v16.4s -; CHECK-NEXT: sub v5.4s, v19.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s -; CHECK-NEXT: ext v7.16b, v7.16b, v18.16b, #8 -; CHECK-NEXT: add v2.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v18.16b, v22.16b, v5.16b, #12 -; CHECK-NEXT: mov v4.s[2], v0.s[2] -; CHECK-NEXT: add v1.4s, v1.4s, v16.4s -; CHECK-NEXT: add v17.4s, v20.4s, v7.4s -; CHECK-NEXT: mov v20.s[1], v5.s[1] -; CHECK-NEXT: trn1 v5.4s, v18.4s, v22.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: uzp2 v18.4s, v3.4s, v1.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s -; CHECK-NEXT: mov v22.s[2], v1.s[2] -; CHECK-NEXT: ext v4.16b, v2.16b, v4.16b, #12 -; CHECK-NEXT: dup v0.4s, v0.s[2] -; CHECK-NEXT: trn1 v6.4s, v18.4s, v16.4s -; CHECK-NEXT: mov v5.s[3], v0.s[3] -; CHECK-NEXT: ext v0.16b, v4.16b, v2.16b, #4 -; CHECK-NEXT: ext v2.16b, v4.16b, v4.16b, #8 -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v4.4s, v22.4s, v6.4s -; CHECK-NEXT: sub v6.4s, v20.4s, v7.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #12 -; CHECK-NEXT: mov v4.s[1], v1.s[1] -; CHECK-NEXT: mov v6.s[2], v17.s[2] -; CHECK-NEXT: add v1.4s, v3.4s, v5.4s -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: mov v5.d[1], v16.d[1] +; CHECK-NEXT: add v17.4s, v20.4s, v22.4s +; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 +; CHECK-NEXT: uzp2 v6.4s, v17.4s, v0.4s +; CHECK-NEXT: sub v5.4s, v7.4s, v5.4s +; CHECK-NEXT: mov v16.16b, v17.16b +; CHECK-NEXT: add v7.4s, v2.4s, v4.4s +; CHECK-NEXT: trn1 v6.4s, v6.4s, v17.4s +; CHECK-NEXT: uzp2 v18.4s, v5.4s, v7.4s +; CHECK-NEXT: dup v17.4s, v1.s[0] +; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #12 +; CHECK-NEXT: mov v16.d[1], v3.d[1] +; CHECK-NEXT: ext v3.16b, v18.16b, v7.16b, #4 +; CHECK-NEXT: mov v6.s[3], v17.s[3] +; CHECK-NEXT: rev64 v19.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: uzp2 v3.4s, v3.4s, v18.4s +; CHECK-NEXT: add v17.4s, v16.4s, v6.4s +; CHECK-NEXT: mov v0.d[1], v19.d[1] +; CHECK-NEXT: dup v18.4s, v17.s[0] +; CHECK-NEXT: sub v6.4s, v16.4s, v6.4s +; CHECK-NEXT: mov v16.16b, v3.16b +; CHECK-NEXT: mov v16.s[3], v18.s[3] +; CHECK-NEXT: sub v18.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v4.4s, v17.4s +; CHECK-NEXT: rev64 v19.4s, v7.4s +; CHECK-NEXT: add v16.4s, v7.4s, v16.4s +; CHECK-NEXT: ext v21.16b, v18.16b, v6.16b, #12 +; CHECK-NEXT: mov v7.s[2], v2.s[2] ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v20.4s, v17.4s, v4.4s +; CHECK-NEXT: mov v17.s[1], v6.s[1] +; CHECK-NEXT: trn1 v1.4s, v21.4s, v18.4s +; CHECK-NEXT: ext v6.16b, v0.16b, v5.16b, #8 +; CHECK-NEXT: uzp2 v21.4s, v5.4s, v0.4s +; CHECK-NEXT: zip2 v19.4s, v19.4s, v5.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v18.s[2], v0.s[2] +; CHECK-NEXT: ext v3.16b, v16.16b, v3.16b, #12 +; CHECK-NEXT: trn1 v6.4s, v21.4s, v6.4s +; CHECK-NEXT: dup v2.4s, v2.s[2] +; CHECK-NEXT: mov v19.d[1], v4.d[1] +; CHECK-NEXT: mov v1.s[3], v2.s[3] +; CHECK-NEXT: ext v2.16b, v3.16b, v16.16b, #4 +; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: add v0.4s, v0.4s, v6.4s +; CHECK-NEXT: sub v4.4s, v17.4s, v19.4s +; CHECK-NEXT: sub v6.4s, v18.4s, v6.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: mov v6.s[1], v0.s[1] +; CHECK-NEXT: mov v4.s[2], v20.s[2] +; CHECK-NEXT: add v0.4s, v5.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v6.4s +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 Index: llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll +++ llvm/test/CodeGen/AArch64/neon-reverseshuffle.ll @@ -46,9 +46,11 @@ define <8 x i16> @v8i16_2(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: v8i16_2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev64 v2.4h, v0.4h -; CHECK-NEXT: rev64 v0.4h, v1.4h -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %V128 = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> Index: llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll =================================================================== --- llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll +++ llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll @@ -136,11 +136,11 @@ define <8 x i16> @shuffle_widen_faili1(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: shuffle_widen_faili1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: rev32 v2.4h, v0.4h -; CHECK-NEXT: rev32 v3.4h, v1.4h -; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4 -; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> @shuffle_widen_fail2(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: shuffle_widen_fail2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 v2.4h, v0.4h, v0.4h -; CHECK-NEXT: trn1 v3.4h, v1.4h, v1.4h -; CHECK-NEXT: ext v1.8b, v2.8b, v1.8b, #4 -; CHECK-NEXT: ext v0.8b, v3.8b, v0.8b, #4 -; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b ; CHECK-NEXT: ret entry: %res = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> %x, <16 x i32> %y, <16 x i32> %s2 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> @@ -51,41 +50,40 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: uzp2 v16.4s, v0.4s, v5.4s ; CHECK-NEXT: dup v4.4s, v6.s[0] -; CHECK-NEXT: rev64 v18.4s, v5.4s -; CHECK-NEXT: ext v19.16b, v7.16b, v0.16b, #8 -; CHECK-NEXT: ext v20.16b, v3.16b, v2.16b, #12 +; CHECK-NEXT: ext v18.16b, v7.16b, v0.16b, #8 +; CHECK-NEXT: uzp2 v19.4s, v0.4s, v7.4s +; CHECK-NEXT: rev64 v21.4s, v6.4s ; CHECK-NEXT: ext v17.16b, v16.16b, v5.16b, #4 +; CHECK-NEXT: ext v20.16b, v3.16b, v2.16b, #12 ; CHECK-NEXT: dup v22.4s, v1.s[2] -; CHECK-NEXT: trn1 v20.4s, v20.4s, v3.4s +; CHECK-NEXT: trn1 v18.4s, v19.4s, v18.4s ; CHECK-NEXT: uzp2 v16.4s, v17.4s, v16.4s -; CHECK-NEXT: zip2 v17.4s, v18.4s, v0.4s +; CHECK-NEXT: trn1 v20.4s, v20.4s, v3.4s ; CHECK-NEXT: mov v3.s[2], v7.s[2] +; CHECK-NEXT: mov v17.16b, v16.16b +; CHECK-NEXT: mov v17.s[3], v4.s[3] +; CHECK-NEXT: rev64 v4.4s, v5.4s ; CHECK-NEXT: mov v20.s[3], v22.s[3] -; CHECK-NEXT: mov v18.16b, v16.16b -; CHECK-NEXT: ext v17.16b, v6.16b, v17.16b, #8 -; CHECK-NEXT: mov v18.s[3], v4.s[3] -; CHECK-NEXT: uzp2 v4.4s, v0.4s, v7.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: rev64 v21.4s, v17.4s -; CHECK-NEXT: add v18.4s, v5.4s, v18.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v18.4s +; CHECK-NEXT: add v17.4s, v5.4s, v17.4s ; CHECK-NEXT: mov v5.s[2], v1.s[2] -; CHECK-NEXT: trn1 v4.4s, v4.4s, v19.4s -; CHECK-NEXT: ext v1.16b, v17.16b, v21.16b, #8 -; CHECK-NEXT: sub v5.4s, v5.4s, v16.4s -; CHECK-NEXT: add v7.4s, v7.4s, v4.4s -; CHECK-NEXT: ext v5.16b, v18.16b, v5.16b, #12 -; CHECK-NEXT: add v16.4s, v6.4s, v1.4s +; CHECK-NEXT: zip2 v4.4s, v4.4s, v0.4s +; CHECK-NEXT: add v1.4s, v6.4s, v21.4s ; CHECK-NEXT: mov v6.s[1], v2.s[1] -; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s -; CHECK-NEXT: ext v2.16b, v5.16b, v18.16b, #4 -; CHECK-NEXT: ext v4.16b, v5.16b, v5.16b, #8 -; CHECK-NEXT: sub v1.4s, v6.4s, v1.4s -; CHECK-NEXT: mov v3.s[1], v7.s[1] -; CHECK-NEXT: mov v1.s[2], v16.s[2] -; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #12 +; CHECK-NEXT: sub v5.4s, v5.4s, v16.4s +; CHECK-NEXT: mov v4.d[1], v21.d[1] +; CHECK-NEXT: ext v2.16b, v17.16b, v5.16b, #12 +; CHECK-NEXT: add v5.4s, v7.4s, v18.4s +; CHECK-NEXT: add v0.4s, v0.4s, v20.4s +; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s +; CHECK-NEXT: ext v7.16b, v2.16b, v17.16b, #4 +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: mov v3.s[1], v5.s[1] +; CHECK-NEXT: mov v4.s[2], v1.s[2] +; CHECK-NEXT: ext v2.16b, v7.16b, v2.16b, #12 +; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -104,44 +102,43 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s ; CHECK-NEXT: sub v4.4s, v1.4s, v5.4s -; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: uzp2 v16.4s, v0.4s, v4.4s -; CHECK-NEXT: rev64 v5.4s, v4.4s ; CHECK-NEXT: add v17.4s, v2.4s, v6.4s +; CHECK-NEXT: uzp2 v16.4s, v0.4s, v4.4s +; CHECK-NEXT: add v1.4s, v1.4s, v5.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s -; CHECK-NEXT: ext v19.16b, v16.16b, v4.16b, #4 -; CHECK-NEXT: zip2 v5.4s, v5.4s, v0.4s -; CHECK-NEXT: add v18.4s, v3.4s, v7.4s +; CHECK-NEXT: add v19.4s, v3.4s, v7.4s +; CHECK-NEXT: ext v18.16b, v16.16b, v4.16b, #4 +; CHECK-NEXT: dup v6.4s, v2.s[0] ; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s -; CHECK-NEXT: uzp2 v6.4s, v19.4s, v16.4s -; CHECK-NEXT: dup v16.4s, v2.s[0] -; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 +; CHECK-NEXT: ext v21.16b, v19.16b, v17.16b, #12 +; CHECK-NEXT: uzp2 v5.4s, v18.4s, v16.4s +; CHECK-NEXT: rev64 v16.4s, v4.4s +; CHECK-NEXT: rev64 v18.4s, v2.4s ; CHECK-NEXT: ext v7.16b, v3.16b, v0.16b, #8 -; CHECK-NEXT: mov v6.s[3], v16.s[3] -; CHECK-NEXT: uzp2 v16.4s, v0.4s, v3.4s -; CHECK-NEXT: rev64 v21.4s, v5.4s -; CHECK-NEXT: ext v20.16b, v18.16b, v17.16b, #12 -; CHECK-NEXT: add v19.4s, v4.4s, v6.4s +; CHECK-NEXT: mov v5.s[3], v6.s[3] +; CHECK-NEXT: zip2 v16.4s, v16.4s, v0.4s +; CHECK-NEXT: uzp2 v6.4s, v0.4s, v3.4s +; CHECK-NEXT: trn1 v21.4s, v21.4s, v19.4s +; CHECK-NEXT: add v20.4s, v4.4s, v5.4s ; CHECK-NEXT: mov v4.s[2], v1.s[2] -; CHECK-NEXT: ext v5.16b, v5.16b, v21.16b, #8 -; CHECK-NEXT: trn1 v20.4s, v20.4s, v18.4s -; CHECK-NEXT: trn1 v7.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s -; CHECK-NEXT: mov v18.s[2], v3.s[2] -; CHECK-NEXT: ext v4.16b, v19.16b, v4.16b, #12 -; CHECK-NEXT: add v6.4s, v2.4s, v5.4s +; CHECK-NEXT: mov v16.d[1], v18.d[1] +; CHECK-NEXT: trn1 v6.4s, v6.4s, v7.4s +; CHECK-NEXT: mov v19.s[2], v3.s[2] +; CHECK-NEXT: sub v4.4s, v4.4s, v5.4s +; CHECK-NEXT: add v5.4s, v2.4s, v16.4s +; CHECK-NEXT: ext v4.16b, v20.16b, v4.16b, #12 ; CHECK-NEXT: mov v2.s[1], v17.s[1] ; CHECK-NEXT: dup v1.4s, v1.s[2] -; CHECK-NEXT: add v16.4s, v3.4s, v7.4s -; CHECK-NEXT: sub v3.4s, v18.4s, v7.4s -; CHECK-NEXT: ext v7.16b, v4.16b, v19.16b, #4 +; CHECK-NEXT: add v7.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v3.4s, v19.4s, v6.4s +; CHECK-NEXT: ext v6.16b, v4.16b, v20.16b, #4 ; CHECK-NEXT: ext v4.16b, v4.16b, v4.16b, #8 -; CHECK-NEXT: mov v20.s[3], v1.s[3] -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: mov v3.s[1], v16.s[1] -; CHECK-NEXT: ext v1.16b, v7.16b, v4.16b, #12 -; CHECK-NEXT: mov v2.s[2], v6.s[2] -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s +; CHECK-NEXT: mov v21.s[3], v1.s[3] +; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s +; CHECK-NEXT: mov v3.s[1], v7.s[1] +; CHECK-NEXT: ext v1.16b, v6.16b, v4.16b, #12 +; CHECK-NEXT: mov v2.s[2], v5.s[2] +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s ; CHECK-NEXT: ret %x = add nsw <16 x i32> %a1, %a2 %y = sub nsw <16 x i32> %a1, %a2 @@ -156,64 +153,61 @@ define <16 x i32> @test1mores(<16 x i32> %a1, <16 x i32> %a2) { ; CHECK-LABEL: test1mores: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2 v7.4s, v3.4s, v2.4s -; CHECK-NEXT: dup v19.4s, v3.s[0] -; CHECK-NEXT: uzp1 v18.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #12 -; CHECK-NEXT: uzp2 v17.4s, v6.4s, v0.4s -; CHECK-NEXT: trn2 v7.4s, v3.4s, v7.4s -; CHECK-NEXT: trn2 v18.4s, v18.4s, v5.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v0.16b, #12 +; CHECK-NEXT: uzp1 v7.4s, v5.4s, v4.4s +; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #4 +; CHECK-NEXT: ext v17.16b, v2.16b, v0.16b, #12 +; CHECK-NEXT: trn2 v7.4s, v7.4s, v5.4s +; CHECK-NEXT: trn2 v16.4s, v3.4s, v16.4s +; CHECK-NEXT: uzp2 v18.4s, v6.4s, v0.4s ; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: ext v7.16b, v5.16b, v7.16b, #8 -; CHECK-NEXT: trn1 v17.4s, v17.4s, v6.4s +; CHECK-NEXT: rev64 v19.4s, v3.4s +; CHECK-NEXT: mov v16.d[1], v1.d[1] +; CHECK-NEXT: ext v1.16b, v4.16b, v7.16b, #12 +; CHECK-NEXT: trn1 v7.4s, v18.4s, v6.4s +; CHECK-NEXT: dup v4.4s, v3.s[0] +; CHECK-NEXT: add v0.4s, v0.4s, v16.4s +; CHECK-NEXT: sub v16.4s, v5.4s, v1.4s +; CHECK-NEXT: mov v17.d[1], v19.d[1] +; CHECK-NEXT: uzp2 v18.4s, v0.4s, v16.4s ; CHECK-NEXT: mov v6.d[1], v2.d[1] -; CHECK-NEXT: ext v2.16b, v4.16b, v18.16b, #12 -; CHECK-NEXT: ext v1.16b, v7.16b, v1.16b, #8 -; CHECK-NEXT: ext v16.16b, v3.16b, v16.16b, #8 -; CHECK-NEXT: mov v17.s[3], v19.s[3] -; CHECK-NEXT: sub v7.4s, v5.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: rev64 v4.4s, v16.4s -; CHECK-NEXT: uzp2 v1.4s, v0.4s, v7.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s -; CHECK-NEXT: ext v4.16b, v16.16b, v4.16b, #8 -; CHECK-NEXT: add v16.4s, v6.4s, v17.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v7.16b, #4 -; CHECK-NEXT: sub v6.4s, v6.4s, v17.4s -; CHECK-NEXT: rev64 v17.4s, v7.4s -; CHECK-NEXT: dup v18.4s, v6.s[0] -; CHECK-NEXT: uzp2 v1.4s, v5.4s, v1.4s -; CHECK-NEXT: add v5.4s, v3.4s, v4.4s -; CHECK-NEXT: zip2 v17.4s, v17.4s, v0.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s -; CHECK-NEXT: mov v1.s[3], v18.s[3] -; CHECK-NEXT: ext v4.16b, v3.16b, v0.16b, #8 -; CHECK-NEXT: ext v17.16b, v6.16b, v17.16b, #8 +; CHECK-NEXT: mov v7.s[3], v4.s[3] +; CHECK-NEXT: add v2.4s, v3.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v16.16b, #4 +; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s +; CHECK-NEXT: add v17.4s, v6.4s, v7.4s +; CHECK-NEXT: sub v6.4s, v6.4s, v7.4s +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v18.4s +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: dup v5.4s, v6.s[0] +; CHECK-NEXT: rev64 v19.4s, v6.4s +; CHECK-NEXT: mov v4.s[3], v5.s[3] +; CHECK-NEXT: rev64 v5.4s, v16.4s +; CHECK-NEXT: ext v21.16b, v2.16b, v17.16b, #12 +; CHECK-NEXT: ext v7.16b, v3.16b, v0.16b, #8 +; CHECK-NEXT: add v20.4s, v16.4s, v4.4s +; CHECK-NEXT: zip2 v5.4s, v5.4s, v0.4s +; CHECK-NEXT: mov v16.s[2], v1.s[2] ; CHECK-NEXT: uzp2 v18.4s, v0.4s, v3.4s -; CHECK-NEXT: add v19.4s, v7.4s, v1.4s -; CHECK-NEXT: mov v7.s[2], v2.s[2] -; CHECK-NEXT: rev64 v21.4s, v17.4s -; CHECK-NEXT: ext v20.16b, v5.16b, v16.16b, #12 -; CHECK-NEXT: trn1 v4.4s, v18.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v7.4s, v1.4s -; CHECK-NEXT: ext v17.16b, v17.16b, v21.16b, #8 -; CHECK-NEXT: trn1 v20.4s, v20.4s, v5.4s -; CHECK-NEXT: mov v5.s[2], v3.s[2] -; CHECK-NEXT: ext v1.16b, v19.16b, v1.16b, #12 -; CHECK-NEXT: add v7.4s, v6.4s, v17.4s -; CHECK-NEXT: mov v6.s[1], v16.s[1] -; CHECK-NEXT: dup v2.4s, v2.s[2] -; CHECK-NEXT: add v18.4s, v3.4s, v4.4s -; CHECK-NEXT: sub v3.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v4.16b, v1.16b, v19.16b, #4 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: mov v20.s[3], v2.s[3] -; CHECK-NEXT: sub v2.4s, v6.4s, v17.4s -; CHECK-NEXT: mov v3.s[1], v18.s[1] -; CHECK-NEXT: ext v1.16b, v4.16b, v1.16b, #12 -; CHECK-NEXT: mov v2.s[2], v7.s[2] -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s +; CHECK-NEXT: dup v1.4s, v1.s[2] +; CHECK-NEXT: mov v5.d[1], v19.d[1] +; CHECK-NEXT: sub v4.4s, v16.4s, v4.4s +; CHECK-NEXT: trn1 v19.4s, v21.4s, v2.4s +; CHECK-NEXT: ext v4.16b, v20.16b, v4.16b, #12 +; CHECK-NEXT: trn1 v7.4s, v18.4s, v7.4s +; CHECK-NEXT: add v16.4s, v6.4s, v5.4s +; CHECK-NEXT: mov v2.s[2], v3.s[2] +; CHECK-NEXT: mov v6.s[1], v17.s[1] +; CHECK-NEXT: mov v19.s[3], v1.s[3] +; CHECK-NEXT: ext v1.16b, v4.16b, v20.16b, #4 +; CHECK-NEXT: ext v4.16b, v4.16b, v4.16b, #8 +; CHECK-NEXT: add v17.4s, v3.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v2.4s, v7.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v5.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v4.16b, #12 +; CHECK-NEXT: mov v3.s[1], v17.s[1] +; CHECK-NEXT: mov v2.s[2], v16.s[2] +; CHECK-NEXT: add v0.4s, v0.4s, v19.4s ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %a1, <16 x i32> %a2, <16 x i32> %s4 = shufflevector <16 x i32> %a2, <16 x i32> %a1, <16 x i32> @@ -232,37 +226,35 @@ define <16 x i32> @test2_1(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test2_1: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v7.16b, v2.16b, v0.16b, #12 -; CHECK-NEXT: uzp1 v17.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp2 v18.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: ext v7.16b, v3.16b, v7.16b, #8 -; CHECK-NEXT: trn2 v17.4s, v17.4s, v5.4s -; CHECK-NEXT: trn1 v18.4s, v18.4s, v6.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #12 -; CHECK-NEXT: rev64 v16.4s, v7.4s -; CHECK-NEXT: ext v4.16b, v4.16b, v17.16b, #12 -; CHECK-NEXT: ext v7.16b, v7.16b, v16.16b, #8 -; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp2 v16.4s, v6.4s, v0.4s +; CHECK-NEXT: ext v17.16b, v2.16b, v0.16b, #12 +; CHECK-NEXT: rev64 v18.4s, v3.4s +; CHECK-NEXT: uzp1 v19.4s, v5.4s, v4.4s +; CHECK-NEXT: trn1 v16.4s, v16.4s, v6.4s +; CHECK-NEXT: zip2 v7.4s, v3.4s, v2.4s +; CHECK-NEXT: mov v17.d[1], v18.d[1] +; CHECK-NEXT: dup v18.4s, v3.s[0] ; CHECK-NEXT: mov v2.s[0], v6.s[0] -; CHECK-NEXT: add v6.4s, v6.4s, v18.4s -; CHECK-NEXT: add v17.4s, v3.4s, v7.4s -; CHECK-NEXT: trn2 v16.4s, v3.4s, v16.4s -; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s -; CHECK-NEXT: dup v3.4s, v3.s[0] -; CHECK-NEXT: ext v7.16b, v17.16b, v7.16b, #12 -; CHECK-NEXT: ext v16.16b, v5.16b, v16.16b, #8 -; CHECK-NEXT: mov v18.s[3], v3.s[3] -; CHECK-NEXT: add v3.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v17.16b, v7.16b, v17.16b, #4 -; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #8 -; CHECK-NEXT: ext v16.16b, v16.16b, v1.16b, #8 +; CHECK-NEXT: add v6.4s, v6.4s, v16.4s +; CHECK-NEXT: trn2 v19.4s, v19.4s, v5.4s +; CHECK-NEXT: mov v16.s[3], v18.s[3] +; CHECK-NEXT: add v18.4s, v3.4s, v17.4s +; CHECK-NEXT: sub v17.4s, v3.4s, v17.4s +; CHECK-NEXT: trn2 v7.4s, v3.4s, v7.4s +; CHECK-NEXT: ext v3.16b, v18.16b, v17.16b, #12 +; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #4 +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: ext v4.16b, v4.16b, v19.16b, #12 +; CHECK-NEXT: ext v17.16b, v3.16b, v18.16b, #4 +; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: mov v7.d[1], v1.d[1] +; CHECK-NEXT: add v18.4s, v5.4s, v4.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v16.4s ; CHECK-NEXT: sub v1.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v18.4s -; CHECK-NEXT: mov v1.s[2], v3.s[2] -; CHECK-NEXT: ext v3.16b, v17.16b, v7.16b, #12 +; CHECK-NEXT: ext v3.16b, v17.16b, v3.16b, #12 +; CHECK-NEXT: mov v1.s[2], v18.s[2] ; CHECK-NEXT: mov v2.s[1], v6.s[1] -; CHECK-NEXT: add v0.4s, v0.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: ret %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> %s2 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> @@ -275,43 +267,41 @@ define <16 x i32> @test2_1_ins(<16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test2_1_ins: ; CHECK: // %bb.0: -; CHECK-NEXT: add v16.4s, v2.4s, v6.4s -; CHECK-NEXT: add v17.4s, v0.4s, v4.4s ; CHECK-NEXT: add v3.4s, v3.4s, v7.4s -; CHECK-NEXT: ext v18.16b, v16.16b, v17.16b, #12 +; CHECK-NEXT: add v7.4s, v0.4s, v4.4s +; CHECK-NEXT: add v16.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s ; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s +; CHECK-NEXT: uzp2 v5.4s, v2.4s, v7.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s -; CHECK-NEXT: zip2 v5.4s, v3.4s, v16.4s -; CHECK-NEXT: ext v7.16b, v3.16b, v18.16b, #8 +; CHECK-NEXT: ext v4.16b, v16.16b, v7.16b, #12 +; CHECK-NEXT: rev64 v17.4s, v3.4s +; CHECK-NEXT: trn1 v5.4s, v5.4s, v2.4s +; CHECK-NEXT: zip2 v6.4s, v3.4s, v16.4s ; CHECK-NEXT: uzp1 v18.4s, v1.4s, v0.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s -; CHECK-NEXT: trn2 v5.4s, v3.4s, v5.4s -; CHECK-NEXT: rev64 v4.4s, v7.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v17.4s -; CHECK-NEXT: trn2 v18.4s, v18.4s, v1.4s -; CHECK-NEXT: mov v17.d[1], v0.d[1] -; CHECK-NEXT: ext v4.16b, v7.16b, v4.16b, #8 -; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: trn1 v6.4s, v6.4s, v2.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v18.16b, #12 -; CHECK-NEXT: ext v5.16b, v1.16b, v5.16b, #8 -; CHECK-NEXT: add v18.4s, v3.4s, v4.4s -; CHECK-NEXT: sub v4.4s, v3.4s, v4.4s -; CHECK-NEXT: dup v3.4s, v3.s[0] -; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #12 +; CHECK-NEXT: mov v4.d[1], v17.d[1] ; CHECK-NEXT: mov v16.s[0], v2.s[0] -; CHECK-NEXT: ext v5.16b, v5.16b, v7.16b, #8 -; CHECK-NEXT: add v7.4s, v2.4s, v6.4s -; CHECK-NEXT: mov v6.s[3], v3.s[3] -; CHECK-NEXT: ext v3.16b, v4.16b, v18.16b, #4 -; CHECK-NEXT: ext v4.16b, v4.16b, v4.16b, #8 -; CHECK-NEXT: add v18.4s, v1.4s, v0.4s -; CHECK-NEXT: sub v2.4s, v16.4s, v6.4s +; CHECK-NEXT: add v19.4s, v2.4s, v5.4s +; CHECK-NEXT: dup v2.4s, v3.s[0] +; CHECK-NEXT: trn2 v6.4s, v3.4s, v6.4s +; CHECK-NEXT: mov v5.s[3], v2.s[3] +; CHECK-NEXT: trn2 v2.4s, v18.4s, v1.4s +; CHECK-NEXT: add v17.4s, v3.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: ext v3.16b, v17.16b, v3.16b, #12 +; CHECK-NEXT: mov v7.d[1], v0.d[1] +; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #12 +; CHECK-NEXT: mov v6.d[1], v4.d[1] +; CHECK-NEXT: ext v4.16b, v3.16b, v17.16b, #4 +; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: add v17.4s, v1.4s, v0.4s +; CHECK-NEXT: sub v2.4s, v16.4s, v5.4s ; CHECK-NEXT: sub v1.4s, v1.4s, v0.4s -; CHECK-NEXT: ext v3.16b, v3.16b, v4.16b, #12 -; CHECK-NEXT: mov v1.s[2], v18.s[2] -; CHECK-NEXT: mov v2.s[1], v7.s[1] -; CHECK-NEXT: add v0.4s, v17.4s, v5.4s +; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #12 +; CHECK-NEXT: mov v1.s[2], v17.s[2] +; CHECK-NEXT: mov v2.s[1], v19.s[1] +; CHECK-NEXT: add v0.4s, v7.4s, v6.4s ; CHECK-NEXT: ret %x = add nsw <16 x i32> %x1, %x2 %y = sub nsw <16 x i32> %x1, %x2 @@ -326,48 +316,45 @@ define <16 x i32> @test2_2(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test2_2: ; CHECK: // %bb.0: +; CHECK-NEXT: uzp2 v16.4s, v6.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s +; CHECK-NEXT: ext v17.16b, v2.16b, v0.16b, #12 ; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: dup v19.4s, v3.s[0] -; CHECK-NEXT: uzp1 v7.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp2 v17.4s, v6.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v0.16b, #12 -; CHECK-NEXT: trn2 v18.4s, v3.4s, v18.4s -; CHECK-NEXT: trn2 v7.4s, v7.4s, v5.4s -; CHECK-NEXT: trn1 v17.4s, v17.4s, v6.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #12 -; CHECK-NEXT: ext v18.16b, v5.16b, v18.16b, #8 +; CHECK-NEXT: uzp1 v19.4s, v5.4s, v4.4s +; CHECK-NEXT: trn1 v16.4s, v16.4s, v6.4s +; CHECK-NEXT: mov v17.d[1], v7.d[1] +; CHECK-NEXT: dup v7.4s, v3.s[0] +; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #4 +; CHECK-NEXT: trn2 v19.4s, v19.4s, v5.4s +; CHECK-NEXT: mov v16.s[3], v7.s[3] +; CHECK-NEXT: trn2 v7.4s, v3.4s, v18.4s ; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: ext v16.16b, v3.16b, v16.16b, #8 -; CHECK-NEXT: mov v17.s[3], v19.s[3] -; CHECK-NEXT: ext v4.16b, v4.16b, v7.16b, #12 -; CHECK-NEXT: ext v1.16b, v18.16b, v1.16b, #8 -; CHECK-NEXT: rev64 v19.4s, v16.4s -; CHECK-NEXT: sub v7.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v4.16b, v19.16b, #12 +; CHECK-NEXT: sub v18.4s, v6.4s, v16.4s +; CHECK-NEXT: mov v7.d[1], v1.d[1] ; CHECK-NEXT: mov v6.d[1], v2.d[1] ; CHECK-NEXT: add v2.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: rev64 v1.4s, v2.4s -; CHECK-NEXT: ext v16.16b, v16.16b, v19.16b, #8 ; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v3.4s, v17.4s ; CHECK-NEXT: uzp2 v5.4s, v0.4s, v2.4s -; CHECK-NEXT: zip2 v1.4s, v1.4s, v0.4s -; CHECK-NEXT: add v18.4s, v3.4s, v16.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s -; CHECK-NEXT: add v6.4s, v6.4s, v17.4s -; CHECK-NEXT: ext v16.16b, v18.16b, v0.16b, #8 -; CHECK-NEXT: ext v7.16b, v3.16b, v7.16b, #12 -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #4 -; CHECK-NEXT: uzp2 v17.4s, v0.4s, v18.4s -; CHECK-NEXT: ext v18.16b, v6.16b, v1.16b, #8 -; CHECK-NEXT: trn1 v0.4s, v7.4s, v3.4s -; CHECK-NEXT: uzp2 v1.4s, v2.4s, v5.4s -; CHECK-NEXT: dup v2.4s, v4.s[2] -; CHECK-NEXT: rev64 v5.4s, v18.4s -; CHECK-NEXT: dup v4.4s, v6.s[0] -; CHECK-NEXT: trn1 v3.4s, v17.4s, v16.4s -; CHECK-NEXT: mov v0.s[3], v2.s[3] -; CHECK-NEXT: mov v1.s[3], v4.s[3] -; CHECK-NEXT: ext v2.16b, v18.16b, v5.16b, #8 +; CHECK-NEXT: sub v3.4s, v3.4s, v17.4s +; CHECK-NEXT: add v6.4s, v6.4s, v16.4s +; CHECK-NEXT: rev64 v16.4s, v2.4s +; CHECK-NEXT: ext v17.16b, v5.16b, v2.16b, #4 +; CHECK-NEXT: ext v18.16b, v3.16b, v18.16b, #12 +; CHECK-NEXT: ext v7.16b, v1.16b, v0.16b, #8 +; CHECK-NEXT: uzp2 v19.4s, v0.4s, v1.4s +; CHECK-NEXT: zip2 v2.4s, v16.4s, v0.4s +; CHECK-NEXT: uzp2 v1.4s, v17.4s, v5.4s +; CHECK-NEXT: trn1 v0.4s, v18.4s, v3.4s +; CHECK-NEXT: dup v5.4s, v6.s[0] +; CHECK-NEXT: rev64 v6.4s, v6.4s +; CHECK-NEXT: dup v4.4s, v4.s[2] +; CHECK-NEXT: trn1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: mov v1.s[3], v5.s[3] +; CHECK-NEXT: mov v0.s[3], v4.s[3] +; CHECK-NEXT: mov v2.d[1], v6.d[1] ; CHECK-NEXT: ret %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> %s2 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> @@ -380,59 +367,57 @@ define <16 x i32> @test2_12(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test2_12: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v16.16b, v2.16b, v0.16b, #12 -; CHECK-NEXT: dup v18.4s, v3.s[0] -; CHECK-NEXT: zip2 v19.4s, v3.4s, v2.4s -; CHECK-NEXT: uzp1 v7.4s, v5.4s, v4.4s +; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v19.4s, v5.4s, v4.4s ; CHECK-NEXT: uzp2 v17.4s, v6.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v3.16b, v16.16b, #8 -; CHECK-NEXT: trn2 v19.4s, v3.4s, v19.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #12 +; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #4 +; CHECK-NEXT: rev64 v7.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v0.16b, #12 +; CHECK-NEXT: trn2 v18.4s, v3.4s, v18.4s +; CHECK-NEXT: trn2 v19.4s, v19.4s, v5.4s ; CHECK-NEXT: trn1 v17.4s, v17.4s, v6.4s -; CHECK-NEXT: rev64 v20.4s, v16.4s -; CHECK-NEXT: trn2 v7.4s, v7.4s, v5.4s -; CHECK-NEXT: mov v6.d[1], v2.d[1] -; CHECK-NEXT: ext v2.16b, v5.16b, v19.16b, #8 -; CHECK-NEXT: ext v16.16b, v16.16b, v20.16b, #8 ; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: ext v4.16b, v4.16b, v7.16b, #12 -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: add v2.4s, v3.4s, v16.4s +; CHECK-NEXT: mov v6.d[1], v2.d[1] +; CHECK-NEXT: dup v2.4s, v3.s[0] +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: mov v16.d[1], v7.d[1] +; CHECK-NEXT: ext v1.16b, v4.16b, v19.16b, #12 +; CHECK-NEXT: mov v17.s[3], v2.s[3] +; CHECK-NEXT: add v2.4s, v0.4s, v18.4s +; CHECK-NEXT: add v7.4s, v3.4s, v16.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s -; CHECK-NEXT: mov v17.s[3], v18.s[3] -; CHECK-NEXT: ext v16.16b, v2.16b, v3.16b, #12 -; CHECK-NEXT: add v18.4s, v5.4s, v4.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s -; CHECK-NEXT: add v7.4s, v6.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v16.16b, v2.16b, #4 -; CHECK-NEXT: ext v16.16b, v16.16b, v16.16b, #8 +; CHECK-NEXT: add v16.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v18.4s +; CHECK-NEXT: add v4.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v19.16b, v7.16b, v3.16b, #12 ; CHECK-NEXT: sub v6.4s, v6.4s, v17.4s -; CHECK-NEXT: rev64 v17.4s, v18.4s -; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v16.16b, #12 -; CHECK-NEXT: dup v5.4s, v4.s[2] -; CHECK-NEXT: zip2 v16.4s, v17.4s, v0.4s -; CHECK-NEXT: ext v17.16b, v2.16b, v0.16b, #8 -; CHECK-NEXT: uzp2 v2.4s, v0.4s, v2.4s -; CHECK-NEXT: uzp2 v0.4s, v0.4s, v18.4s -; CHECK-NEXT: ext v20.16b, v3.16b, v6.16b, #12 -; CHECK-NEXT: mov v4.s[2], v18.s[2] -; CHECK-NEXT: ext v16.16b, v7.16b, v16.16b, #8 -; CHECK-NEXT: ext v18.16b, v0.16b, v18.16b, #4 -; CHECK-NEXT: trn1 v20.4s, v20.4s, v3.4s -; CHECK-NEXT: mov v6.s[1], v7.s[1] -; CHECK-NEXT: rev64 v3.4s, v16.4s -; CHECK-NEXT: uzp2 v18.4s, v18.4s, v0.4s -; CHECK-NEXT: trn1 v0.4s, v2.4s, v17.4s -; CHECK-NEXT: dup v2.4s, v7.s[0] -; CHECK-NEXT: mov v20.s[3], v5.s[3] -; CHECK-NEXT: mov v18.s[3], v2.s[3] -; CHECK-NEXT: ext v2.16b, v16.16b, v3.16b, #8 -; CHECK-NEXT: mul v3.4s, v1.4s, v0.4s -; CHECK-NEXT: mul v0.4s, v19.4s, v20.4s -; CHECK-NEXT: mul v1.4s, v4.4s, v18.4s -; CHECK-NEXT: mul v2.4s, v6.4s, v2.4s +; CHECK-NEXT: uzp2 v17.4s, v0.4s, v16.4s +; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s +; CHECK-NEXT: ext v5.16b, v19.16b, v7.16b, #4 +; CHECK-NEXT: mov v20.16b, v1.16b +; CHECK-NEXT: ext v18.16b, v19.16b, v19.16b, #8 +; CHECK-NEXT: ext v19.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: ext v21.16b, v17.16b, v16.16b, #4 +; CHECK-NEXT: mov v20.s[2], v16.s[2] +; CHECK-NEXT: rev64 v16.4s, v16.4s +; CHECK-NEXT: ext v5.16b, v5.16b, v18.16b, #12 +; CHECK-NEXT: uzp2 v17.4s, v21.4s, v17.4s +; CHECK-NEXT: ext v18.16b, v7.16b, v0.16b, #8 +; CHECK-NEXT: uzp2 v7.4s, v0.4s, v7.4s +; CHECK-NEXT: trn1 v19.4s, v19.4s, v3.4s +; CHECK-NEXT: zip2 v16.4s, v16.4s, v0.4s +; CHECK-NEXT: rev64 v0.4s, v4.4s +; CHECK-NEXT: dup v1.4s, v1.s[2] +; CHECK-NEXT: dup v3.4s, v4.s[0] +; CHECK-NEXT: mov v6.s[1], v4.s[1] +; CHECK-NEXT: trn1 v4.4s, v7.4s, v18.4s +; CHECK-NEXT: mov v19.s[3], v1.s[3] +; CHECK-NEXT: mov v17.s[3], v3.s[3] +; CHECK-NEXT: mov v16.d[1], v0.d[1] +; CHECK-NEXT: mul v3.4s, v5.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v2.4s, v19.4s +; CHECK-NEXT: mul v1.4s, v20.4s, v17.4s +; CHECK-NEXT: mul v2.4s, v6.4s, v16.4s ; CHECK-NEXT: ret %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> %s2 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> @@ -496,55 +481,53 @@ ; CHECK-LABEL: test3_2: ; CHECK: // %bb.0: ; CHECK-NEXT: uzp2 v16.4s, v5.4s, v6.4s -; CHECK-NEXT: zip1 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: uzp1 v19.4s, v7.4s, v1.4s +; CHECK-NEXT: zip1 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v18.4s, v7.4s, v1.4s ; CHECK-NEXT: uzp2 v16.4s, v16.4s, v5.4s -; CHECK-NEXT: ext v21.16b, v2.16b, v17.16b, #8 ; CHECK-NEXT: zip2 v19.4s, v7.4s, v19.4s -; CHECK-NEXT: zip2 v22.4s, v2.4s, v3.4s -; CHECK-NEXT: rev64 v23.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: mov v2.s[3], v3.s[2] -; CHECK-NEXT: rev64 v3.4s, v7.4s ; CHECK-NEXT: zip1 v20.4s, v5.4s, v6.4s +; CHECK-NEXT: ext v21.16b, v2.16b, v17.16b, #8 ; CHECK-NEXT: trn2 v18.4s, v7.4s, v18.4s +; CHECK-NEXT: zip2 v22.4s, v2.4s, v3.4s ; CHECK-NEXT: mov v16.d[1], v19.d[1] -; CHECK-NEXT: trn2 v19.4s, v23.4s, v4.4s -; CHECK-NEXT: zip1 v7.4s, v7.4s, v7.4s -; CHECK-NEXT: trn2 v0.4s, v4.4s, v0.4s -; CHECK-NEXT: zip2 v4.4s, v5.4s, v6.4s -; CHECK-NEXT: zip2 v3.4s, v3.4s, v1.4s +; CHECK-NEXT: rev64 v19.4s, v0.4s +; CHECK-NEXT: mov v2.s[3], v3.s[2] +; CHECK-NEXT: trn2 v3.4s, v5.4s, v6.4s +; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s +; CHECK-NEXT: zip1 v6.4s, v7.4s, v7.4s +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: rev64 v7.4s, v7.4s ; CHECK-NEXT: mov v18.d[1], v21.d[1] +; CHECK-NEXT: ext v6.16b, v6.16b, v1.16b, #4 +; CHECK-NEXT: trn2 v0.4s, v4.4s, v0.4s +; CHECK-NEXT: zip2 v1.4s, v7.4s, v1.4s ; CHECK-NEXT: mov v22.d[1], v20.d[1] -; CHECK-NEXT: trn2 v5.4s, v5.4s, v6.4s -; CHECK-NEXT: ext v1.16b, v7.16b, v1.16b, #4 -; CHECK-NEXT: mov v17.d[1], v4.d[1] +; CHECK-NEXT: trn2 v19.4s, v19.4s, v4.4s +; CHECK-NEXT: mov v17.d[1], v5.d[1] +; CHECK-NEXT: mov v3.d[1], v6.d[1] ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: mov v3.d[1], v2.d[1] -; CHECK-NEXT: mov v5.d[1], v1.d[1] -; CHECK-NEXT: sub v1.4s, v22.4s, v17.4s -; CHECK-NEXT: sub v2.4s, v19.4s, v0.4s -; CHECK-NEXT: sub v3.4s, v18.4s, v3.4s -; CHECK-NEXT: ext v4.16b, v1.16b, v2.16b, #12 -; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v5.4s, v16.4s, v5.4s -; CHECK-NEXT: add v0.4s, v19.4s, v0.4s -; CHECK-NEXT: add v7.4s, v22.4s, v17.4s -; CHECK-NEXT: ext v4.16b, v3.16b, v4.16b, #8 -; CHECK-NEXT: uzp1 v6.4s, v5.4s, v0.4s -; CHECK-NEXT: trn2 v1.4s, v3.4s, v1.4s -; CHECK-NEXT: uzp2 v2.4s, v7.4s, v2.4s -; CHECK-NEXT: rev64 v16.4s, v4.4s -; CHECK-NEXT: trn2 v6.4s, v6.4s, v5.4s -; CHECK-NEXT: ext v18.16b, v5.16b, v1.16b, #8 -; CHECK-NEXT: ext v5.16b, v5.16b, v5.16b, #12 -; CHECK-NEXT: trn1 v2.4s, v2.4s, v7.4s -; CHECK-NEXT: dup v17.4s, v3.s[0] -; CHECK-NEXT: ext v3.16b, v4.16b, v16.16b, #8 -; CHECK-NEXT: ext v1.16b, v0.16b, v6.16b, #12 -; CHECK-NEXT: ext v0.16b, v18.16b, v5.16b, #8 -; CHECK-NEXT: mov v2.s[3], v17.s[3] +; CHECK-NEXT: mov v1.d[1], v2.d[1] +; CHECK-NEXT: add v2.4s, v22.4s, v17.4s +; CHECK-NEXT: add v4.4s, v16.4s, v3.4s +; CHECK-NEXT: add v5.4s, v19.4s, v0.4s +; CHECK-NEXT: sub v1.4s, v18.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v22.4s, v17.4s +; CHECK-NEXT: sub v6.4s, v19.4s, v0.4s +; CHECK-NEXT: zip2 v0.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: uzp1 v16.4s, v4.4s, v5.4s +; CHECK-NEXT: uzp2 v6.4s, v2.4s, v6.4s +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: trn2 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #4 +; CHECK-NEXT: trn2 v4.4s, v16.4s, v4.4s +; CHECK-NEXT: trn1 v2.4s, v6.4s, v2.4s +; CHECK-NEXT: dup v16.4s, v1.s[0] +; CHECK-NEXT: mov v3.d[1], v7.d[1] +; CHECK-NEXT: mov v0.d[1], v17.d[1] +; CHECK-NEXT: ext v1.16b, v5.16b, v4.16b, #12 +; CHECK-NEXT: mov v2.s[3], v16.s[3] ; CHECK-NEXT: ret %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> %s2 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> @@ -559,62 +542,60 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: uzp1 v18.4s, v7.4s, v1.4s ; CHECK-NEXT: uzp2 v19.4s, v5.4s, v6.4s -; CHECK-NEXT: zip1 v21.4s, v7.4s, v1.4s ; CHECK-NEXT: zip1 v20.4s, v2.4s, v3.4s +; CHECK-NEXT: zip1 v21.4s, v7.4s, v1.4s ; CHECK-NEXT: zip2 v18.4s, v7.4s, v18.4s ; CHECK-NEXT: uzp2 v19.4s, v19.4s, v5.4s -; CHECK-NEXT: trn2 v21.4s, v7.4s, v21.4s ; CHECK-NEXT: rev64 v23.4s, v0.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: zip1 v16.4s, v5.4s, v6.4s ; CHECK-NEXT: mov v19.d[1], v18.d[1] ; CHECK-NEXT: zip1 v18.4s, v7.4s, v7.4s -; CHECK-NEXT: rev64 v7.4s, v7.4s -; CHECK-NEXT: zip1 v16.4s, v5.4s, v6.4s ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: ext v22.16b, v2.16b, v20.16b, #8 +; CHECK-NEXT: trn2 v21.4s, v7.4s, v21.4s ; CHECK-NEXT: mov v2.s[3], v3.s[2] -; CHECK-NEXT: trn2 v3.4s, v5.4s, v6.4s -; CHECK-NEXT: zip2 v7.4s, v7.4s, v1.4s -; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s +; CHECK-NEXT: rev64 v3.4s, v7.4s +; CHECK-NEXT: trn2 v7.4s, v5.4s, v6.4s +; CHECK-NEXT: ext v18.16b, v18.16b, v1.16b, #4 ; CHECK-NEXT: trn2 v0.4s, v4.4s, v0.4s ; CHECK-NEXT: mov v17.d[1], v16.d[1] -; CHECK-NEXT: mov v21.d[1], v22.d[1] ; CHECK-NEXT: trn2 v16.4s, v23.4s, v4.4s -; CHECK-NEXT: ext v1.16b, v18.16b, v1.16b, #4 -; CHECK-NEXT: mov v7.d[1], v2.d[1] -; CHECK-NEXT: mov v20.d[1], v5.d[1] +; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s +; CHECK-NEXT: zip2 v3.4s, v5.4s, v6.4s +; CHECK-NEXT: mov v7.d[1], v18.d[1] ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: mov v3.d[1], v1.d[1] -; CHECK-NEXT: add v4.4s, v21.4s, v7.4s -; CHECK-NEXT: sub v5.4s, v17.4s, v20.4s -; CHECK-NEXT: sub v6.4s, v16.4s, v0.4s -; CHECK-NEXT: sub v7.4s, v21.4s, v7.4s -; CHECK-NEXT: add v2.4s, v17.4s, v20.4s -; CHECK-NEXT: ext v17.16b, v5.16b, v6.16b, #12 -; CHECK-NEXT: zip2 v18.4s, v7.4s, v5.4s -; CHECK-NEXT: add v1.4s, v19.4s, v3.4s -; CHECK-NEXT: add v0.4s, v16.4s, v0.4s -; CHECK-NEXT: sub v3.4s, v19.4s, v3.4s -; CHECK-NEXT: uzp1 v16.4s, v1.4s, v0.4s -; CHECK-NEXT: trn2 v18.4s, v7.4s, v18.4s -; CHECK-NEXT: ext v17.16b, v7.16b, v17.16b, #8 -; CHECK-NEXT: uzp2 v19.4s, v2.4s, v6.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: trn2 v16.4s, v16.4s, v1.4s -; CHECK-NEXT: ext v18.16b, v1.16b, v18.16b, #8 -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: trn1 v2.4s, v19.4s, v2.4s -; CHECK-NEXT: rev64 v19.4s, v17.4s -; CHECK-NEXT: dup v7.4s, v7.s[0] -; CHECK-NEXT: ext v16.16b, v0.16b, v16.16b, #12 -; CHECK-NEXT: mov v2.s[3], v7.s[3] -; CHECK-NEXT: mov v0.d[1], v6.d[1] -; CHECK-NEXT: ext v6.16b, v18.16b, v1.16b, #8 -; CHECK-NEXT: ext v7.16b, v17.16b, v19.16b, #8 -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s -; CHECK-NEXT: add v1.4s, v3.4s, v16.4s -; CHECK-NEXT: add v0.4s, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v7.4s +; CHECK-NEXT: mov v21.d[1], v22.d[1] +; CHECK-NEXT: mov v1.d[1], v2.d[1] +; CHECK-NEXT: mov v20.d[1], v3.d[1] +; CHECK-NEXT: add v2.4s, v19.4s, v7.4s +; CHECK-NEXT: add v3.4s, v16.4s, v0.4s +; CHECK-NEXT: add v4.4s, v21.4s, v1.4s +; CHECK-NEXT: uzp1 v5.4s, v2.4s, v3.4s +; CHECK-NEXT: add v6.4s, v17.4s, v20.4s +; CHECK-NEXT: sub v1.4s, v21.4s, v1.4s +; CHECK-NEXT: sub v17.4s, v17.4s, v20.4s +; CHECK-NEXT: sub v0.4s, v16.4s, v0.4s +; CHECK-NEXT: trn2 v5.4s, v5.4s, v2.4s +; CHECK-NEXT: zip2 v16.4s, v1.4s, v17.4s +; CHECK-NEXT: uzp2 v20.4s, v6.4s, v0.4s +; CHECK-NEXT: ext v18.16b, v17.16b, v0.16b, #12 +; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #12 +; CHECK-NEXT: mov v17.d[1], v6.d[1] +; CHECK-NEXT: mov v3.d[1], v0.d[1] +; CHECK-NEXT: trn2 v0.4s, v1.4s, v16.4s +; CHECK-NEXT: trn1 v6.4s, v20.4s, v6.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: dup v16.4s, v1.s[0] +; CHECK-NEXT: rev64 v1.4s, v1.4s +; CHECK-NEXT: mov v6.s[3], v16.s[3] +; CHECK-NEXT: mov v0.d[1], v2.d[1] +; CHECK-NEXT: sub v7.4s, v19.4s, v7.4s +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: add v2.4s, v17.4s, v6.4s +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: add v1.4s, v7.4s, v5.4s +; CHECK-NEXT: add v3.4s, v4.4s, v18.4s ; CHECK-NEXT: ret %s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> %s2 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> @@ -630,91 +611,86 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test23: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v16.16b, v2.16b, v0.16b, #12 -; CHECK-NEXT: dup v19.4s, v3.s[0] -; CHECK-NEXT: uzp1 v7.4s, v5.4s, v4.4s -; CHECK-NEXT: mov v17.16b, v0.16b +; CHECK-NEXT: uzp2 v17.4s, v6.4s, v0.4s ; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s -; CHECK-NEXT: uzp2 v0.4s, v6.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v3.16b, v16.16b, #8 -; CHECK-NEXT: trn2 v7.4s, v7.4s, v5.4s +; CHECK-NEXT: uzp1 v19.4s, v5.4s, v4.4s +; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #4 +; CHECK-NEXT: rev64 v7.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v0.16b, #12 +; CHECK-NEXT: trn1 v17.4s, v17.4s, v6.4s ; CHECK-NEXT: trn2 v18.4s, v3.4s, v18.4s -; CHECK-NEXT: trn1 v0.4s, v0.4s, v6.4s -; CHECK-NEXT: rev64 v20.4s, v16.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v5.16b, #12 -; CHECK-NEXT: mov v17.d[1], v4.d[1] -; CHECK-NEXT: ext v18.16b, v5.16b, v18.16b, #8 +; CHECK-NEXT: trn2 v19.4s, v19.4s, v5.4s ; CHECK-NEXT: mov v6.d[1], v2.d[1] -; CHECK-NEXT: mov v0.s[3], v19.s[3] -; CHECK-NEXT: ext v2.16b, v16.16b, v20.16b, #8 -; CHECK-NEXT: ext v4.16b, v4.16b, v7.16b, #12 -; CHECK-NEXT: ext v1.16b, v18.16b, v1.16b, #8 -; CHECK-NEXT: add v7.4s, v6.4s, v0.4s -; CHECK-NEXT: add v18.4s, v3.4s, v2.4s -; CHECK-NEXT: add v19.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s -; CHECK-NEXT: zip1 v6.4s, v7.4s, v18.4s -; CHECK-NEXT: sub v3.4s, v5.4s, v4.4s -; CHECK-NEXT: zip1 v4.4s, v2.4s, v19.4s -; CHECK-NEXT: uzp1 v20.4s, v2.4s, v19.4s -; CHECK-NEXT: ext v5.16b, v7.16b, v6.16b, #8 -; CHECK-NEXT: uzp2 v21.4s, v3.4s, v0.4s -; CHECK-NEXT: trn2 v4.4s, v2.4s, v4.4s -; CHECK-NEXT: add v16.4s, v17.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v20.4s -; CHECK-NEXT: uzp2 v20.4s, v21.4s, v3.4s -; CHECK-NEXT: mov v4.d[1], v5.d[1] -; CHECK-NEXT: rev64 v5.4s, v16.4s -; CHECK-NEXT: zip1 v21.4s, v2.4s, v2.4s -; CHECK-NEXT: ext v16.16b, v16.16b, v16.16b, #4 -; CHECK-NEXT: mov v20.d[1], v17.d[1] -; CHECK-NEXT: trn2 v17.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v21.16b, v21.16b, v19.16b, #4 -; CHECK-NEXT: trn2 v5.4s, v5.4s, v1.4s -; CHECK-NEXT: trn2 v1.4s, v1.4s, v16.4s -; CHECK-NEXT: rev64 v2.4s, v2.4s -; CHECK-NEXT: mov v17.d[1], v21.d[1] -; CHECK-NEXT: zip2 v16.4s, v7.4s, v18.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: mov v7.s[3], v18.s[2] -; CHECK-NEXT: zip2 v2.4s, v2.4s, v19.4s -; CHECK-NEXT: zip1 v21.4s, v3.4s, v0.4s -; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v3.4s, v20.4s, v17.4s -; CHECK-NEXT: add v18.4s, v5.4s, v1.4s -; CHECK-NEXT: mov v2.d[1], v7.d[1] -; CHECK-NEXT: uzp1 v7.4s, v3.4s, v18.4s -; CHECK-NEXT: mov v16.d[1], v21.d[1] -; CHECK-NEXT: mov v6.d[1], v0.d[1] -; CHECK-NEXT: add v19.4s, v4.4s, v2.4s -; CHECK-NEXT: trn2 v0.4s, v7.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v16.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s -; CHECK-NEXT: ext v5.16b, v18.16b, v0.16b, #12 -; CHECK-NEXT: zip2 v0.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v7.16b, v4.16b, v1.16b, #12 -; CHECK-NEXT: add v6.4s, v16.4s, v6.4s +; CHECK-NEXT: dup v2.4s, v3.s[0] ; CHECK-NEXT: mov v18.d[1], v1.d[1] -; CHECK-NEXT: trn2 v0.4s, v2.4s, v0.4s -; CHECK-NEXT: uzp2 v1.4s, v6.4s, v1.4s -; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #8 -; CHECK-NEXT: mov v4.d[1], v6.d[1] -; CHECK-NEXT: ext v0.16b, v3.16b, v0.16b, #8 -; CHECK-NEXT: trn1 v1.4s, v1.4s, v6.4s -; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: rev64 v6.4s, v7.4s +; CHECK-NEXT: mov v17.s[3], v2.s[3] +; CHECK-NEXT: ext v1.16b, v4.16b, v19.16b, #12 +; CHECK-NEXT: mov v16.d[1], v7.d[1] +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: add v4.4s, v6.4s, v17.4s +; CHECK-NEXT: add v7.4s, v5.4s, v1.4s +; CHECK-NEXT: add v19.4s, v3.4s, v16.4s +; CHECK-NEXT: sub v6.4s, v6.4s, v17.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s +; CHECK-NEXT: sub v1.4s, v5.4s, v1.4s +; CHECK-NEXT: add v2.4s, v0.4s, v18.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v18.4s +; CHECK-NEXT: uzp1 v5.4s, v3.4s, v7.4s +; CHECK-NEXT: zip1 v16.4s, v1.4s, v6.4s +; CHECK-NEXT: zip2 v17.4s, v4.4s, v19.4s +; CHECK-NEXT: uzp2 v18.4s, v1.4s, v6.4s +; CHECK-NEXT: zip1 v20.4s, v4.4s, v19.4s +; CHECK-NEXT: zip1 v21.4s, v3.4s, v7.4s +; CHECK-NEXT: zip2 v5.4s, v3.4s, v5.4s +; CHECK-NEXT: uzp2 v18.4s, v18.4s, v1.4s +; CHECK-NEXT: mov v17.d[1], v16.d[1] +; CHECK-NEXT: ext v16.16b, v4.16b, v20.16b, #8 +; CHECK-NEXT: trn2 v21.4s, v3.4s, v21.4s +; CHECK-NEXT: mov v18.d[1], v5.d[1] +; CHECK-NEXT: rev64 v5.4s, v2.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: mov v21.d[1], v16.d[1] +; CHECK-NEXT: zip1 v16.4s, v3.4s, v3.4s +; CHECK-NEXT: trn2 v5.4s, v5.4s, v0.4s +; CHECK-NEXT: trn2 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: rev64 v2.4s, v3.4s +; CHECK-NEXT: ext v3.16b, v16.16b, v7.16b, #4 +; CHECK-NEXT: trn2 v16.4s, v1.4s, v6.4s +; CHECK-NEXT: mov v4.s[3], v19.s[2] +; CHECK-NEXT: zip2 v2.4s, v2.4s, v7.4s +; CHECK-NEXT: zip2 v1.4s, v1.4s, v6.4s +; CHECK-NEXT: mov v16.d[1], v3.d[1] +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: mov v2.d[1], v4.d[1] +; CHECK-NEXT: mov v20.d[1], v1.d[1] +; CHECK-NEXT: add v1.4s, v18.4s, v16.4s +; CHECK-NEXT: add v3.4s, v5.4s, v0.4s +; CHECK-NEXT: add v4.4s, v21.4s, v2.4s +; CHECK-NEXT: uzp1 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v21.4s, v2.4s +; CHECK-NEXT: sub v0.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v5.4s, v17.4s, v20.4s +; CHECK-NEXT: add v17.4s, v17.4s, v20.4s +; CHECK-NEXT: trn2 v6.4s, v6.4s, v1.4s +; CHECK-NEXT: zip2 v19.4s, v2.4s, v5.4s +; CHECK-NEXT: uzp2 v20.4s, v17.4s, v0.4s +; CHECK-NEXT: ext v7.16b, v5.16b, v0.16b, #12 +; CHECK-NEXT: ext v6.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: mov v5.d[1], v17.d[1] +; CHECK-NEXT: mov v3.d[1], v0.d[1] +; CHECK-NEXT: trn2 v0.4s, v2.4s, v19.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: trn1 v17.4s, v20.4s, v17.4s +; CHECK-NEXT: rev64 v19.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: mov v1.s[3], v2.s[3] -; CHECK-NEXT: ext v0.16b, v0.16b, v3.16b, #8 -; CHECK-NEXT: ext v6.16b, v7.16b, v6.16b, #8 -; CHECK-NEXT: sub v3.4s, v20.4s, v17.4s -; CHECK-NEXT: add v2.4s, v4.4s, v1.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s -; CHECK-NEXT: add v1.4s, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v19.4s, v6.4s +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: mov v17.s[3], v2.s[3] +; CHECK-NEXT: mov v7.d[1], v19.d[1] +; CHECK-NEXT: sub v1.4s, v18.4s, v16.4s +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: add v2.4s, v5.4s, v17.4s +; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: add v3.4s, v4.4s, v7.4s ; CHECK-NEXT: ret %s10 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> %s20 = shufflevector <16 x i32> %y, <16 x i32> %x, <16 x i32> Index: llvm/test/CodeGen/AArch64/shuffles.ll =================================================================== --- llvm/test/CodeGen/AArch64/shuffles.ll +++ llvm/test/CodeGen/AArch64/shuffles.ll @@ -4,22 +4,21 @@ define <16 x i32> @test_shuf1(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test_shuf1: ; CHECK: // %bb.0: -; CHECK-NEXT: zip2 v3.4s, v7.4s, v6.4s -; CHECK-NEXT: ext v5.16b, v6.16b, v4.16b, #12 -; CHECK-NEXT: uzp1 v6.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v4.4s, v2.4s, v4.4s -; CHECK-NEXT: trn2 v3.4s, v7.4s, v3.4s -; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #8 -; CHECK-NEXT: trn2 v6.4s, v6.4s, v1.4s -; CHECK-NEXT: trn1 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: ext v3.16b, v1.16b, v3.16b, #8 -; CHECK-NEXT: rev64 v16.4s, v5.4s -; CHECK-NEXT: dup v7.4s, v7.s[0] -; CHECK-NEXT: ext v1.16b, v0.16b, v6.16b, #12 -; CHECK-NEXT: mov v2.s[3], v7.s[3] -; CHECK-NEXT: ext v0.16b, v3.16b, v4.16b, #8 -; CHECK-NEXT: ext v3.16b, v5.16b, v16.16b, #8 +; CHECK-NEXT: uzp1 v16.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v3.16b, v6.16b, v4.16b, #12 +; CHECK-NEXT: zip2 v6.4s, v7.4s, v6.4s +; CHECK-NEXT: uzp2 v17.4s, v2.4s, v4.4s +; CHECK-NEXT: trn2 v16.4s, v16.4s, v1.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: trn2 v4.4s, v7.4s, v6.4s +; CHECK-NEXT: rev64 v5.4s, v7.4s +; CHECK-NEXT: trn1 v2.4s, v17.4s, v2.4s +; CHECK-NEXT: dup v6.4s, v7.s[0] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: ext v1.16b, v0.16b, v16.16b, #12 +; CHECK-NEXT: mov v2.s[3], v6.s[3] +; CHECK-NEXT: mov v0.16b, v4.16b ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> ret <16 x i32> %s3 @@ -29,10 +28,9 @@ ; CHECK-LABEL: test_shuf2: ; CHECK: // %bb.0: ; CHECK-NEXT: zip2 v0.4s, v7.4s, v6.4s -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4 ; CHECK-NEXT: trn2 v0.4s, v7.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8 -; CHECK-NEXT: ext v0.16b, v0.16b, v2.16b, #8 +; CHECK-NEXT: mov v0.d[1], v1.d[1] ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> ret <4 x i32> %s3 @@ -64,10 +62,9 @@ define <4 x i32> @test_shuf5(<16 x i32> %x, <16 x i32> %y) { ; CHECK-LABEL: test_shuf5: ; CHECK: // %bb.0: +; CHECK-NEXT: rev64 v1.4s, v7.4s ; CHECK-NEXT: ext v0.16b, v6.16b, v4.16b, #12 -; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #8 -; CHECK-NEXT: rev64 v1.4s, v0.4s -; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: mov v0.d[1], v1.d[1] ; CHECK-NEXT: ret %s3 = shufflevector <16 x i32> %x, <16 x i32> %y, <4 x i32> ret <4 x i32> %s3