diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5655,13 +5655,12 @@
 def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
           (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;

-// All concat_vectors operations are canonicalised to act on i64 vectors for
-// AArch64. In the general case we need an instruction, which had just as well be
-// INS.
+// The only legal concat_vectors operation on AArch64 without SVE is
+// concatenating two 64-bit vectors. In the general case, lower it to ZIP1.
 class ConcatPat<ValueType DstTy, ValueType SrcTy>
   : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
-        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
-                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+        (ZIP1v2i64 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                   (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub))>;

 def : ConcatPat<v2i64, v1i64>;
 def : ConcatPat<v2f64, v1f64>;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-concat-vectors.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-concat-vectors.mir
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-concat-vectors.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-concat-vectors.mir
@@ -21,10 +21,10 @@
 ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
 ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.dsub
+; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
 ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY]], %subreg.dsub
-; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG1]], 1, [[INSERT_SUBREG]], 0
-; CHECK: $q0 = COPY [[INSvi64lane]]
+; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
+; CHECK: [[ZIP1v2i64_:%[0-9]+]]:fpr128 = ZIP1v2i64 [[INSERT_SUBREG]], [[INSERT_SUBREG1]]
+; CHECK: $q0 = COPY [[ZIP1v2i64_]]
 ; CHECK: RET_ReallyLR implicit $q0
 %0:fpr(<2 x s32>) = COPY $d0
@@ -54,10 +54,10 @@
 ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1
 ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.dsub
+; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.dsub
 ; CHECK: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY]], %subreg.dsub
-; CHECK: [[INSvi64lane:%[0-9]+]]:fpr128 = INSvi64lane [[INSERT_SUBREG1]], 1, [[INSERT_SUBREG]], 0
-; CHECK: $q0 = COPY [[INSvi64lane]]
+; CHECK: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.dsub
+; CHECK: [[ZIP1v2i64_:%[0-9]+]]:fpr128 = ZIP1v2i64 [[INSERT_SUBREG]], [[INSERT_SUBREG1]]
+; CHECK: $q0 = COPY [[ZIP1v2i64_]]
 ; CHECK: RET_ReallyLR implicit $q0
 %0:fpr(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1511,7 +1511,7 @@
 define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32>
@@ -1522,7 +1522,7 @@
 ; CHECK-LABEL: test_concat_v16i8_v8i8_v16i8:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <8 x i8> %x, i32 0
@@ -1549,7 +1549,7 @@
 ; CHECK-LABEL: test_concat_v16i8_v16i8_v8i8:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <16 x i8> %x, i32 0
@@ -1592,7 +1592,7 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <8 x i8> %x, i32 0
@@ -1633,7 +1633,7 @@
 define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32>
@@ -1644,7 +1644,7 @@
 ; CHECK-LABEL: test_concat_v8i16_v4i16_v8i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <4 x i16> %x, i32 0
@@ -1663,7 +1663,7 @@
 ; CHECK-LABEL: test_concat_v8i16_v8i16_v4i16:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <8 x i16> %x, i32 0
@@ -1690,7 +1690,7 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <4 x i16> %x, i32 0
@@ -1715,7 +1715,7 @@
 define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32>
@@ -1726,7 +1726,7 @@
 ; CHECK-LABEL: test_concat_v4i32_v2i32_v4i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <2 x i32> %x, i32 0
@@ -1741,7 +1741,7 @@
 ; CHECK-LABEL: test_concat_v4i32_v4i32_v2i32:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <4 x i32> %x, i32 0
@@ -1760,7 +1760,7 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecinit6 = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32>
@@ -1809,7 +1809,7 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %vecext = extractelement <1 x i64> %x, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcombine.ll b/llvm/test/CodeGen/AArch64/arm64-vcombine.ll
--- a/llvm/test/CodeGen/AArch64/arm64-vcombine.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcombine.ll
@@ -1,12 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s

 ; LowerCONCAT_VECTORS() was reversing the order of two parts.
 ; rdar://11558157
 ; rdar://11559553
 define <16 x i8> @test(<16 x i8> %q0, <16 x i8> %q1, i8* nocapture %dest) nounwind {
-entry:
 ; CHECK-LABEL: test:
-; CHECK: mov.d v0[1], v1[0]
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: zip1.2d v0, v0, v1
+; CHECK-NEXT: ret
+entry:
 %0 = bitcast <16 x i8> %q0 to <2 x i64>
 %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> zeroinitializer
 %1 = bitcast <16 x i8> %q1 to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
--- a/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll
@@ -90,7 +90,7 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %shuffle.i = shufflevector <4 x bfloat> %low, <4 x bfloat> %high, <8 x i32>
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -24,7 +24,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %v16i8 = shufflevector <8 x i8> %A, <8 x i8> %B, <16 x i32>
 ret <16 x i8> %v16i8
@@ -44,7 +44,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %v8i16 = shufflevector <4 x i16> %A, <4 x i16> %B, <8 x i32>
 ret <8 x i16> %v8i16
@@ -67,7 +67,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %v4i32 = shufflevector <2 x i32> %A, <2 x i32> %B, <4 x i32>
 ret <4 x i32> %v4i32
@@ -100,7 +100,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %v8half= shufflevector <4 x half> %A, <4 x half> %B, <8 x i32>
 ret <8 x half> %v8half
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
--- a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll
@@ -13,7 +13,7 @@
 ; CHECK-NEXT: dup v0.4s, w0
 ; CHECK-NEXT: movi v1.4h, #4
 ; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: xtn v1.8b, v0.8h
 ; CHECK-NEXT: xtn2 v1.16b, v0.8h
 ; CHECK-NEXT: str q1, [x8]
diff --git a/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
--- a/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v16-instructions.ll
@@ -9,12 +9,12 @@
 ; CHECK-NEXT: scvtf v0.4s, v0.4s
 ; CHECK-NEXT: scvtf v3.4s, v3.4s
 ; CHECK-NEXT: scvtf v2.4s, v2.4s
-; CHECK-NEXT: fcvtn v4.4h, v1.4s
+; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
 ; CHECK-NEXT: fcvtn v3.4h, v3.4s
-; CHECK-NEXT: fcvtn v1.4h, v2.4s
-; CHECK-NEXT: mov v0.d[1], v4.d[0]
-; CHECK-NEXT: mov v1.d[1], v3.d[0]
+; CHECK-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v2.2d, v3.2d
 ; CHECK-NEXT: ret
 %1 = sitofp <16 x i32> %a to <16 x half>
@@ -41,12 +41,12 @@
 ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
 ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
 ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NEXT: fcvtn v1.4h, v2.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn v3.4h, v6.4s
-; CHECK-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-NEXT: mov v1.d[1], v3.d[0]
+; CHECK-NEXT: fcvtn v2.4h, v6.4s
+; CHECK-NEXT: fcvtn v3.4h, v4.4s
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v3.2d, v2.2d
 ; CHECK-NEXT: ret
@@ -64,12 +64,12 @@
 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
 ; CHECK-NEXT: ucvtf v3.4s, v3.4s
 ; CHECK-NEXT: ucvtf v2.4s, v2.4s
-; CHECK-NEXT: fcvtn v4.4h, v1.4s
+; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
 ; CHECK-NEXT: fcvtn v3.4h, v3.4s
-; CHECK-NEXT: fcvtn v1.4h, v2.4s
-; CHECK-NEXT: mov v0.d[1], v4.d[0]
-; CHECK-NEXT: mov v1.d[1], v3.d[0]
+; CHECK-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v2.2d, v3.2d
 ; CHECK-NEXT: ret
 %1 = uitofp <16 x i32> %a to <16 x half>
@@ -96,12 +96,12 @@
 ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
 ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d
 ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d
-; CHECK-NEXT: fcvtn v2.4h, v2.4s
+; CHECK-NEXT: fcvtn v1.4h, v2.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: fcvtn v3.4h, v6.4s
-; CHECK-NEXT: fcvtn v1.4h, v4.4s
-; CHECK-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-NEXT: mov v1.d[1], v3.d[0]
+; CHECK-NEXT: fcvtn v2.4h, v6.4s
+; CHECK-NEXT: fcvtn v3.4h, v4.4s
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: zip1 v1.2d, v3.2d, v2.2d
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
--- a/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -300,7 +300,7 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = fptrunc <8 x float> %a to <8 x half>
 ret <8 x half> %1
@@ -401,7 +401,7 @@
 ; CHECK-NEXT: scvtf v0.4s, v0.4s
 ; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = sitofp <8 x i8> %a to <8 x half>
 ret <8 x half> %1
@@ -417,7 +417,7 @@
 ; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
 ; CHECK-CVT-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-CVT-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-CVT-NEXT: ret
 ;
 ; CHECK-FP16-LABEL: sitofp_i16:
@@ -436,7 +436,7 @@
 ; CHECK-NEXT: scvtf v0.4s, v0.4s
 ; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = sitofp <8 x i32> %a to <8 x half>
 ret <8 x half> %1
@@ -456,7 +456,7 @@
 ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
 ; CHECK-NEXT: fcvtn v1.4h, v2.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = sitofp <8 x i64> %a to <8 x half>
 ret <8 x half> %1
@@ -472,7 +472,7 @@
 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
 ; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = uitofp <8 x i8> %a to <8 x half>
 ret <8 x half> %1
@@ -488,7 +488,7 @@
 ; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
 ; CHECK-CVT-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-CVT-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-CVT-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-CVT-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-CVT-NEXT: ret
 ;
 ; CHECK-FP16-LABEL: uitofp_i16:
@@ -507,7 +507,7 @@
 ; CHECK-NEXT: ucvtf v0.4s, v0.4s
 ; CHECK-NEXT: fcvtn v1.4h, v1.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = uitofp <8 x i32> %a to <8 x half>
 ret <8 x half> %1
@@ -527,7 +527,7 @@
 ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
 ; CHECK-NEXT: fcvtn v1.4h, v2.4s
 ; CHECK-NEXT: fcvtn v0.4h, v0.4s
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %1 = uitofp <8 x i64> %a to <8 x half>
 ret <8 x half> %1
diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
--- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -276,7 +276,7 @@
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 entry:
 %shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32>
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -895,7 +895,7 @@
 ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI89_0]
 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: tbl v0.8b, { v0.16b }, v2.8b
 ; CHECK-NEXT: ret
 %c = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
diff --git a/llvm/test/CodeGen/AArch64/shuffle-mask-legal.ll b/llvm/test/CodeGen/AArch64/shuffle-mask-legal.ll
--- a/llvm/test/CodeGen/AArch64/shuffle-mask-legal.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-mask-legal.ll
@@ -7,7 +7,7 @@
 ; CHECK-LABEL: PR41535:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v0.2d
 ; CHECK-NEXT: ret
 %cat1 = shufflevector <2 x i32> %p1, <2 x i32> undef, <4 x i32>
 %cat2 = shufflevector <2 x i32> %p2, <2 x i32> undef, <4 x i32>
diff --git a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
--- a/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
+++ b/llvm/test/CodeGen/AArch64/speculation-hardening-loads.ll
@@ -112,7 +112,7 @@
 ; CHECK-NEXT: and x1, x1, x16
 ; CHECK-NEXT: csdb
 ; CHECK-NEXT: ldr d0, [x1]
-; CHECK-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-NEXT: zip1 v0.2d, v0.2d, v0.2d
 ; CHECK-NEXT: str q0, [x0]
 ; CHECK-NEXT: mov [[TMPREG:x[0-9]+]], sp
 ; CHECK-NEXT: and [[TMPREG]], [[TMPREG]], x16
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll
@@ -36,7 +36,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) #0 {
 ; CHECK-LABEL: concat_v16i8:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32>
@@ -179,7 +179,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) #0 {
 ; CHECK-LABEL: concat_v8i16:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32>
 ret <8 x i16> %res
@@ -291,7 +291,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) #0 {
 ; CHECK-LABEL: concat_v4i32:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32>
 ret <4 x i32> %res
@@ -379,7 +379,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
 ; CHECK-LABEL: concat_v2i64:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32>
 ret <2 x i64> %res
@@ -470,7 +470,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) #0 {
 ; CHECK-LABEL: concat_v8f16:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32>
 ret <8 x half> %res
@@ -582,7 +582,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) #0 {
 ; CHECK-LABEL: concat_v4f32:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32>
 ret <4 x float> %res
@@ -670,7 +670,7 @@
 ; Don't use SVE for 128-bit vectors.
 define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) #0 {
 ; CHECK-LABEL: concat_v2f64:
-; CHECK: mov v0.d[1], v1.d[0]
+; CHECK: zip1 v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: ret
 %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32>
 ret <2 x double> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll
@@ -468,9 +468,9 @@
 ; VBITS_EQ_256-DAG: fcvt [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
+; VBITS_EQ_256-DAG: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-DAG: mov v0.d[1], v[[RES_HI]].d[0]
+; VBITS_EQ_256-DAG: zip1 v0.2d, v[[RES_LO]].2d, v[[RES_HI]].2d
 %op1 = load <8 x double>, <8 x double>* %a
 %res = fptrunc <8 x double> %op1 to <8 x half>
 ret <8 x half> %res
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll
@@ -658,9 +658,9 @@
 ; VBITS_EQ_256-DAG: fcvtzu [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
+; VBITS_EQ_256-DAG: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
+; VBITS_EQ_256-NEXT: zip1 v0.2d, v[[RES_LO]].2d, v[[RES_HI]].2d
 ; VBITS_EQ_256-NEXT: ret
 %op1 = load <8 x double>, <8 x double>* %a
 %res = fptoui <8 x double> %op1 to <8 x i16>
@@ -1526,9 +1526,9 @@
 ; VBITS_EQ_256-DAG: fcvtzs [[CVT_LO:z[0-9]+]].d, [[PG2]]/m, [[LO]].d
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
+; VBITS_EQ_256-DAG: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
+; VBITS_EQ_256-NEXT: zip1 v0.2d, v[[RES_LO]].2d, v[[RES_HI]].2d
 ; VBITS_EQ_256-NEXT: ret
 %op1 = load <8 x double>, <8 x double>* %a
 %res = fptosi <8 x double> %op1 to <8 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll
@@ -657,9 +657,9 @@
 ; VBITS_EQ_256-DAG: ucvtf [[CVT_HI:z[0-9]+]].h, [[PG2]]/m, [[HI]].d
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[CVT_LO]].s, [[CVT_LO]].s
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
-; VBITS_EQ_256-DAG: uzp1 z0.h, [[UZP_LO]].h, [[UZP_LO]].h
+; VBITS_EQ_256-DAG: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v0.d[1], v[[RES_HI]].d[0]
+; VBITS_EQ_256-DAG: zip1 v0.2d, v[[RES_LO]].2d, v[[RES_HI]].2d
 ; VBITS_EQ_256-NEXT: ret
 %op1 = load <8 x i64>, <8 x i64>* %a
 %res = uitofp <8 x i64> %op1 to <8 x half>
@@ -1526,7 +1526,7 @@
 ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[CVT_HI]].s, [[CVT_HI]].s
 ; VBITS_EQ_256-DAG: uzp1 z[[RES_LO:[0-9]+]].h, [[UZP_LO]].h, [[UZP_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[RES_HI:[0-9]+]].h, [[UZP_HI]].h, [[UZP_HI]].h
-; VBITS_EQ_256-NEXT: mov v[[RES_LO]].d[1], v[[RES_HI]].d[0]
+; VBITS_EQ_256-NEXT: zip1 v0.2d, v[[RES_LO]].2d, v[[RES_HI]].2d
 ; VBITS_EQ_256-NEXT: ret
 %op1 = load <8 x i64>, <8 x i64>* %a
 %res = sitofp <8 x i64> %op1 to <8 x half>
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -260,8 +260,8 @@
 ; VBITS_EQ_256-DAG: uzp1 z[[UZP2_LO:[0-9]+]].h, [[UZP1_LO]].h, [[UZP1_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 [[UZP1_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s
 ; VBITS_EQ_256-DAG: uzp1 z[[UZP2_HI:[0-9]+]].h, [[UZP1_HI]].h, [[UZP1_HI]].h
-; VBITS_EQ_256-NEXT: mov v[[UZP2_LO]].d[1], v[[UZP2_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[UZP2_LO]], [x0]
+; VBITS_EQ_256-NEXT: zip1 v[[RESULT:[0-9]+]].2d, v[[UZP2_LO]].2d, v[[UZP2_HI]].2d
+; VBITS_EQ_256-NEXT: str q[[RESULT]], [x0]
 ; VBITS_EQ_256-NEXT: ret
 %cval = load <8 x i16>, <8 x i16>* %a
 %ptrs = load <8 x i16*>, <8 x i16*>* %b
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll
@@ -110,8 +110,8 @@
 ; VBITS_EQ_256-DAG: uzp1 [[WORDS_HI:z[0-9]+]].s, [[DWORDS_HI]].s, [[DWORDS_HI]].s
 ; VBITS_EQ_256-DAG: uzp1 z[[HALFS_LO:[0-9]+]].h, [[WORDS_LO]].h, [[WORDS_LO]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[HALFS_HI:[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
-; VBITS_EQ_256-NEXT: mov v[[HALFS_LO]].d[1], v[[HALFS_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[HALFS_LO]], [x1]
+; VBITS_EQ_256-NEXT: zip1 v[[HALFS_STORE:[0-9]+]].2d, v[[HALFS_LO]].2d, v[[HALFS_HI]].2d
+; VBITS_EQ_256-NEXT: str q[[HALFS_STORE]], [x1]
 ; VBITS_EQ_256-NEXT: ret
 %a = load <8 x i64>, <8 x i64>* %ap
 %val = trunc <8 x i64> %a to <8 x i16>
@@ -161,8 +161,8 @@
 ; VBITS_EQ_256-DAG: uzp1 [[HALFS_HI:z[0-9]+]].h, [[WORDS_HI]].h, [[WORDS_HI]].h
 ; VBITS_EQ_256-DAG: uzp1 z[[BYTES_LO:[0-9]+]].b, [[HALFS_LO]].b, [[HALFS_LO]].b
 ; VBITS_EQ_256-DAG: uzp1 z[[BYTES_HI:[0-9]+]].b, [[HALFS_HI]].b, [[HALFS_HI]].b
-; VBITS_EQ_256-NEXT: mov v[[BYTES_LO]].d[1], v[[BYTES_HI]].d[0]
-; VBITS_EQ_256-NEXT: str q[[BYTES_LO]], [x1]
+; VBITS_EQ_256-NEXT: zip1 v[[BYTES_STORE:[0-9]+]].2d, v[[BYTES_LO]].2d, v[[BYTES_HI]].2d
+; VBITS_EQ_256-NEXT: str q[[BYTES_STORE]], [x1]
 ; VBITS_EQ_256-NEXT: ret
 %a = load <16 x i32>, <16 x i32>* %ap
 %val = trunc <16 x i32> %a to <16 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
--- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -418,7 +418,7 @@
 ; FP16-NEXT: fcvtn v2.4h, v2.4s
 ; FP16-NEXT: fcvtn v1.4h, v1.4s
-; FP16-NEXT: mov.d v1[1], v2[0]
+; FP16-NEXT: zip1.2d v1, v1, v2
 ; FP16-NEXT: movi.8h v2, #128, lsl #8
 ; FP16-NEXT: bit.16b v0, v1, v2
 ; FP16-NEXT: ret
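
Illustration (not part of the patch): for a plain two-half concatenation like the ones exercised in concat-vector.ll and sve-fixed-length-concat.ll, the shuffle below now selects a single ZIP1 of the two 128-bit registers instead of an element move. The function name is made up for this example; the before/after instructions are taken from the test updates above.

define <4 x i32> @concat_example(<2 x i32> %a, <2 x i32> %b) {
  %r = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %r
}

; before: mov v0.d[1], v1.d[0]
; after:  zip1 v0.2d, v0.2d, v1.2d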