llvm · Oct 11, 2013
diff --git a/‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Lines changed: 194 additions & 4 deletions b/‎llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Lines changed: 194 additions & 4 deletions
diff --git a/‎llvm/lib/Target/AArch64/AArch64ISelLowering.h
Lines changed: 7 additions & 3 deletions b/‎llvm/lib/Target/AArch64/AArch64ISelLowering.h
Lines changed: 7 additions & 3 deletions
diff --git a/‎llvm/lib/Target/AArch64/AArch64InstrNEON.td
Lines changed: 263 additions & 49 deletions b/‎llvm/lib/Target/AArch64/AArch64InstrNEON.td
Lines changed: 263 additions & 49 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/neon-copy.ll
Lines changed: 188 additions & 0 deletions b/‎llvm/test/CodeGen/AArch64/neon-copy.ll
Lines changed: 188 additions & 0 deletions
diff --git a/‎llvm/test/MC/AArch64/neon-diagnostics.s
Lines changed: 185 additions & 0 deletions b/‎llvm/test/MC/AArch64/neon-diagnostics.s
Lines changed: 185 additions & 0 deletions
diff --git a/‎llvm/test/MC/AArch64/neon-simd-copy.s
Lines changed: 38 additions & 0 deletions b/‎llvm/test/MC/AArch64/neon-simd-copy.s
Lines changed: 38 additions & 0 deletions
@@ -297,15 +297,21 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
     setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
     setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
 
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
     setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
 
+    // CONCAT_VECTORS is marked Legal for all four 128-bit integer vector types.
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
     setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
@@ -866,12 +874,12 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "AArch64ISD::NEON_CMPZ";
   case AArch64ISD::NEON_TST:
     return "AArch64ISD::NEON_TST";
-  case AArch64ISD::NEON_DUPIMM:
-    return "AArch64ISD::NEON_DUPIMM";
   case AArch64ISD::NEON_QSHLs:
     return "AArch64ISD::NEON_QSHLs";
   case AArch64ISD::NEON_QSHLu:
     return "AArch64ISD::NEON_QSHLu";
+  case AArch64ISD::NEON_VDUP:
+    return "AArch64ISD::NEON_VDUP";
   case AArch64ISD::NEON_VDUPLANE:
     return "AArch64ISD::NEON_VDUPLANE";
   default:
@@ -3342,7 +3350,7 @@ static SDValue PerformShiftCombine(SDNode *N,
   case ISD::SHL:
     if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
       SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                       DAG.getConstant(Cnt, MVT::i32));
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
     }
@@ -3352,7 +3360,7 @@ static SDValue PerformShiftCombine(SDNode *N,
   case ISD::SRL:
     if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
       SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_DUPIMM, SDLoc(N->getOperand(1)), VT,
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
                       DAG.getConstant(Cnt, MVT::i32));
       return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
     }
@@ -3492,13 +3500,115 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
       }
     }
   }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  // Use VDUP for non-constant splats.
+  if (hasDominantValue && EltSize <= 64) {
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are DUPing a value that comes directly from a vector, we could
+      // just use DUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the DUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i32));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (usesOnlyOneValue && isConstant) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+    }
+  }
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
   return SDValue();
 }
 
 SDValue
 AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                 SelectionDAG &DAG) const {
   SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
@@ -3516,9 +3626,97 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
       // If this is undef splat, generate it via "just" vdup, if possible.
       if (Lane == -1) Lane = 0;
 
+      // Test if V1 is a SCALAR_TO_VECTOR.
+      if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+      }
+      // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
+      if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+        bool IsScalarToVector = true;
+        for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
+          if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
+              i != (unsigned)Lane) {
+            IsScalarToVector = false;
+            break;
+          }
+        if (IsScalarToVector)
+          return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
+                             V1.getOperand(Lane));
+      }
       return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
                          DAG.getConstant(Lane, MVT::i64));
     }
+    // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
+    // by element from V2 to V1 .
+    // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
+    // better choice to be inserted than V1 as less insert needed, so we count
+    // element to be inserted for both V1 and V2, and select less one as insert
+    // target.
+
+    // Collect elements need to be inserted and their index.
+    SmallVector<int, 8> NV1Elt;
+    SmallVector<int, 8> N1Index;
+    SmallVector<int, 8> NV2Elt;
+    SmallVector<int, 8> N2Index;
+    int Length = ShuffleMask.size();
+    int V1EltNum = V1.getValueType().getVectorNumElements();
+    for (int I = 0; I != Length; ++I) {
+      if (ShuffleMask[I] != I) {
+        NV1Elt.push_back(ShuffleMask[I]);
+        N1Index.push_back(I);
+      }
+    }
+    for (int I = 0; I != Length; ++I) {
+      if (ShuffleMask[I] != (I + V1EltNum)) {
+        NV2Elt.push_back(ShuffleMask[I]);
+        N2Index.push_back(I);
+      }
+    }
+
+    // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
+    // will be inserted.
+    SDValue InsV = V1;
+    SmallVector<int, 8> InsMasks = NV1Elt;
+    SmallVector<int, 8> InsIndex = N1Index;
+    if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
+      if (NV1Elt.size() > NV2Elt.size()) {
+        InsV = V2;
+        InsMasks = NV2Elt;
+        InsIndex = N2Index;
+      }
+    } else {
+      InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+    }
+
+    // Apply the inserts one after another. Each INSERT_VECTOR_ELT takes the
+    // vector produced by the previous one, so InsV accumulates every lane.
+    for (int I = 0, E = InsMasks.size(); I != E; ++I) {
+      int Mask = InsMasks[I];
+      // A negative mask entry is an undef lane: any value will do, so there
+      // is nothing to insert.
+      if (Mask < 0)
+        continue;
+      // Mask values from V1EltNum upwards index into V2, starting at lane 0.
+      SDValue ExtV = V1;
+      if (Mask >= V1EltNum) {
+        ExtV = V2;
+        Mask -= V1EltNum;
+      }
+      // Any value type smaller than i32 is illegal in AArch64, and this lower
+      // function is called after legalize pass, so we need to legalize
+      // the result here. Floating-point lanes keep a floating-point type.
+      EVT EltVT;
+      if (VT.getVectorElementType().isFloatingPoint())
+        EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
+      else
+        EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
+      SDValue PassN = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+                                  DAG.getConstant(Mask, MVT::i64));
+      InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, PassN,
+                         DAG.getConstant(InsIndex[I], MVT::i64));
+    }
+    return InsV;
   }
 
   return SDValue();
 
@@ -134,13 +134,13 @@ namespace AArch64ISD {
     // Vector compare bitwise test
     NEON_TST,
 
-    // Operation for the immediate in vector shift
-    NEON_DUPIMM,
-
     // Vector saturating shift
     NEON_QSHLs,
     NEON_QSHLu,
 
+    // Vector dup
+    NEON_VDUP,
+
     // Vector dup by lane
     NEON_VDUPLANE
   };
@@ -296,6 +296,10 @@ enum NeonModImmType {
   Neon_Mov_Imm,
   Neon_Mvn_Imm
 };
+
+extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
+                                bool &usesOnlyOneValue, bool &hasDominantValue,
+                                bool &isConstant, bool &isUNDEF);
 } // namespace llvm
 
 #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
@@ -225,8 +225,196 @@ define i64 @smovx2s(<2 x i32> %tmp1) {
   ret i64 %tmp4
 }
 
+define <8 x i8> @test_vcopy_lane_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins  {{v[0-9]+}}.b[5], {{v[0-9]+}}.b[3]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 11, i32 6, i32 7>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins  {{v[0-9]+}}.b[14], {{v[0-9]+}}.b[6]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 22, i32 15>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vcopy_lane_swap_s8(<8 x i8> %v1, <8 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[7], {{v[0-9]+}}.b[0]
+  %vset_lane = shufflevector <8 x i8> %v1, <8 x i8> %v2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 0>
+  ret <8 x i8> %vset_lane
+}
+
+define <16 x i8> @test_vcopyq_laneq_swap_s8(<16 x i8> %v1, <16 x i8> %v2) {
+;CHECK: ins {{v[0-9]+}}.b[0], {{v[0-9]+}}.b[15]
+  %vset_lane = shufflevector <16 x i8> %v1, <16 x i8> %v2, <16 x i32> <i32 15, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i8> %vset_lane
+}
+
+define <8 x i8> @test_vdup_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i8> %vecinit6.i, i8 %v1, i32 7
+  ret <8 x i8> %vecinit7.i
+}
+
+define <4 x i16> @test_vdup_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %v1, i32 3
+  ret <4 x i16> %vecinit3.i
+}
+
+define <2 x i32> @test_vdup_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{w[0-9]+}}
+  %vecinit.i = insertelement <2 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %v1, i32 1
+  ret <2 x i32> %vecinit1.i
+}
+
+define <1 x i64> @test_vdup_n_u64(i64 %v1) #0 {
+;CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
+  %vecinit.i = insertelement <1 x i64> undef, i64 %v1, i32 0
+  ret <1 x i64> %vecinit.i
+}
 
+define <16 x i8> @test_vdupq_n_u8(i8 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{w[0-9]+}}
+  %vecinit.i = insertelement <16 x i8> undef, i8 %v1, i32 0
+  %vecinit1.i = insertelement <16 x i8> %vecinit.i, i8 %v1, i32 1
+  %vecinit2.i = insertelement <16 x i8> %vecinit1.i, i8 %v1, i32 2
+  %vecinit3.i = insertelement <16 x i8> %vecinit2.i, i8 %v1, i32 3
+  %vecinit4.i = insertelement <16 x i8> %vecinit3.i, i8 %v1, i32 4
+  %vecinit5.i = insertelement <16 x i8> %vecinit4.i, i8 %v1, i32 5
+  %vecinit6.i = insertelement <16 x i8> %vecinit5.i, i8 %v1, i32 6
+  %vecinit7.i = insertelement <16 x i8> %vecinit6.i, i8 %v1, i32 7
+  %vecinit8.i = insertelement <16 x i8> %vecinit7.i, i8 %v1, i32 8
+  %vecinit9.i = insertelement <16 x i8> %vecinit8.i, i8 %v1, i32 9
+  %vecinit10.i = insertelement <16 x i8> %vecinit9.i, i8 %v1, i32 10
+  %vecinit11.i = insertelement <16 x i8> %vecinit10.i, i8 %v1, i32 11
+  %vecinit12.i = insertelement <16 x i8> %vecinit11.i, i8 %v1, i32 12
+  %vecinit13.i = insertelement <16 x i8> %vecinit12.i, i8 %v1, i32 13
+  %vecinit14.i = insertelement <16 x i8> %vecinit13.i, i8 %v1, i32 14
+  %vecinit15.i = insertelement <16 x i8> %vecinit14.i, i8 %v1, i32 15
+  ret <16 x i8> %vecinit15.i
+}
 
+define <8 x i16> @test_vdupq_n_u16(i16 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}}
+  %vecinit.i = insertelement <8 x i16> undef, i16 %v1, i32 0
+  %vecinit1.i = insertelement <8 x i16> %vecinit.i, i16 %v1, i32 1
+  %vecinit2.i = insertelement <8 x i16> %vecinit1.i, i16 %v1, i32 2
+  %vecinit3.i = insertelement <8 x i16> %vecinit2.i, i16 %v1, i32 3
+  %vecinit4.i = insertelement <8 x i16> %vecinit3.i, i16 %v1, i32 4
+  %vecinit5.i = insertelement <8 x i16> %vecinit4.i, i16 %v1, i32 5
+  %vecinit6.i = insertelement <8 x i16> %vecinit5.i, i16 %v1, i32 6
+  %vecinit7.i = insertelement <8 x i16> %vecinit6.i, i16 %v1, i32 7
+  ret <8 x i16> %vecinit7.i
+}
 
+define <4 x i32> @test_vdupq_n_u32(i32 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
+  %vecinit.i = insertelement <4 x i32> undef, i32 %v1, i32 0
+  %vecinit1.i = insertelement <4 x i32> %vecinit.i, i32 %v1, i32 1
+  %vecinit2.i = insertelement <4 x i32> %vecinit1.i, i32 %v1, i32 2
+  %vecinit3.i = insertelement <4 x i32> %vecinit2.i, i32 %v1, i32 3
+  ret <4 x i32> %vecinit3.i
+}
+
+define <2 x i64> @test_vdupq_n_u64(i64 %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
+  %vecinit.i = insertelement <2 x i64> undef, i64 %v1, i32 0
+  %vecinit1.i = insertelement <2 x i64> %vecinit.i, i64 %v1, i32 1
+  ret <2 x i64> %vecinit1.i
+}
 
+define <8 x i8> @test_vdup_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <8 x i8> %v1, <8 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <4 x i16> %v1, <4 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <2 x i32> %v1, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <1 x i64> %v1, <1 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
+
+define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <8 x i8> %shuffle
+}
+
+define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i16> %shuffle
+}
+
+define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
+  ret <2 x i32> %shuffle
+}
+
+define <16 x i8> @test_vdupq_laneq_s8(<16 x i8> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.16b, {{v[0-9]+}}.b[5]
+  %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  ret <16 x i8> %shuffle
+}
+
+define <8 x i16> @test_vdupq_laneq_s16(<8 x i16> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.8h, {{v[0-9]+}}.h[2]
+  %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x i16> %shuffle
+}
+
+define <4 x i32> @test_vdupq_laneq_s32(<4 x i32> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+  %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %shuffle
+}
+
+define <2 x i64> @test_vdupq_laneq_s64(<2 x i64> %v1) #0 {
+;CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+  %shuffle = shufflevector <2 x i64> %v1, <2 x i64> undef, <2 x i32> zeroinitializer
+  ret <2 x i64> %shuffle
+}
 
@@ -4101,3 +4101,188 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:        st4 {v31.2s-v1.2s}, [x31]
 // CHECK-ERROR:            ^
+
+         ins v2.b[16], w1
+         ins v7.h[8], w14
+         ins v20.s[5], w30
+         ins v1.d[2], x7
+         ins v2.b[3], b1
+         ins v7.h[2], h14
+         ins v20.s[1], s30
+         ins v1.d[0], d7
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v2.b[16], w1
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v7.h[8], w14
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v20.s[5], w30
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         ins v1.d[2], x7
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v2.b[3], b1
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v7.h[2], h14
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v20.s[1], s30
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         ins v1.d[0], d7
+// CHECK-ERROR:                      ^
+
+         smov w1, v0.b[16]
+         smov w14, v6.h[8]
+         smov x1, v0.b[16]
+         smov x14, v6.h[8]
+         smov x20, v9.s[5]
+         smov w1, v0.d[0]
+         smov w14, v6.d[1]
+         smov x1, v0.d[0]
+         smov x14, v6.d[1]
+         smov x20, v9.d[0]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         smov w1, v0.b[16]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         smov w14, v6.h[8]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         smov x1, v0.b[16]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         smov x14, v6.h[8]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         smov x20, v9.s[5]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         smov w1, v0.d[0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         smov w14, v6.d[1]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         smov x1, v0.d[0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         smov x14, v6.d[1]
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         smov x20, v9.d[0]
+// CHECK-ERROR:                      ^
+
+         umov w1, v0.b[16]
+         umov w14, v6.h[8]
+         umov w20, v9.s[5]
+         umov x7, v18.d[3]
+         umov w1, v0.d[0]
+         umov s20, v9.s[2]
+         umov d7, v18.d[1]
+
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         umov w1, v0.b[16]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         umov w14, v6.h[8]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         umov w20, v9.s[5]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR:         umov x7, v18.d[3]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         umov w1, v0.d[0]
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         umov s20, v9.s[2]
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         umov d7, v18.d[1]
+// CHECK-ERROR:              ^
+
+         Ins v1.h[2], v3.b[6]
+         Ins v6.h[7], v7.s[2]
+         Ins v15.d[0], v22.s[2]
+         Ins v0.d[0], v4.b[1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         Ins v1.h[2], v3.b[6]
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         Ins v6.h[7], v7.s[2]
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         Ins v15.d[0], v22.s[2]
+// CHECK-ERROR:                           ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         Ins v0.d[0], v4.b[1]
+// CHECK-ERROR:                         ^
+
+         dup v1.8h, v2.b[2]
+         dup v11.4s, v7.h[7]
+         dup v17.2d, v20.s[0]
+         dup v1.16b, v2.h[2]
+         dup v11.8h, v7.s[3]
+         dup v17.4s, v20.d[0]
+         dup v5.2d, v1.b[1]
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v1.8h, v2.b[2]
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v11.4s, v7.h[7]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v17.2d, v20.s[0]
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v1.16b, v2.h[2]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v11.8h, v7.s[3]
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v17.4s, v20.d[0]
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v5.2d, v1.b[1]
+// CHECK-ERROR:                       ^
+
+         dup v1.8b, b1
+         dup v11.4h, h14
+         dup v17.2s, s30
+         dup v1.16b, d2
+         dup v11.8s, w16
+         dup v17.4d, w28
+         dup v5.2d, w0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v1.8b, b1
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v11.4h, h14
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v17.2s, s30
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v1.16b, d2
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v11.8s, w16
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v17.4d, w28
+// CHECK-ERROR:             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         dup v5.2d, w0
+// CHECK-ERROR:                    ^
+
@@ -60,6 +60,44 @@
 // CHECK: ins	v15.s[3], v22.s[2]      // encoding: [0xcf,0x5e,0x1c,0x6e]
 // CHECK: ins	v0.d[0], v4.d[1]        // encoding: [0x80,0x44,0x08,0x6e]
 
+//------------------------------------------------------------------------------
+// Duplicate to all lanes( vector, from element)
+//------------------------------------------------------------------------------
+         dup v1.8b, v2.b[2]
+         dup v11.4h, v7.h[7]
+         dup v17.2s, v20.s[0]
+         dup v1.16b, v2.b[2]
+         dup v11.8h, v7.h[7]
+         dup v17.4s, v20.s[0]
+         dup v5.2d, v1.d[1]         
+
+// CHECK: dup v1.8b, v2.b[2]        // encoding: [0x41,0x04,0x05,0x0e]
+// CHECK: dup v11.4h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x0e]
+// CHECK: dup v17.2s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x0e]
+// CHECK: dup v1.16b, v2.b[2]       // encoding: [0x41,0x04,0x05,0x4e]
+// CHECK: dup v11.8h, v7.h[7]       // encoding: [0xeb,0x04,0x1e,0x4e]
+// CHECK: dup v17.4s, v20.s[0]      // encoding: [0x91,0x06,0x04,0x4e]
+// CHECK: dup v5.2d, v1.d[1]        // encoding: [0x25,0x04,0x18,0x4e]
+
+//------------------------------------------------------------------------------
+// Duplicate to all lanes( vector, from main)
+//------------------------------------------------------------------------------
+         dup v1.8b, w1
+         dup v11.4h, w14
+         dup v17.2s, w30
+         dup v1.16b, w2
+         dup v11.8h, w16
+         dup v17.4s, w28
+         dup v5.2d, x0        
+
+// CHECK: dup	v1.8b, w1             // encoding: [0x21,0x0c,0x01,0x0e]
+// CHECK: dup	v11.4h, w14           // encoding: [0xcb,0x0d,0x0a,0x0e]
+// CHECK: dup	v17.2s, w30           // encoding: [0xd1,0x0f,0x14,0x0e]
+// CHECK: dup	v1.16b, w2            // encoding: [0x41,0x0c,0x01,0x4e]
+// CHECK: dup	v11.8h, w16           // encoding: [0x0b,0x0e,0x0a,0x4e]
+// CHECK: dup	v17.4s, w28           // encoding: [0x91,0x0f,0x14,0x4e]
+// CHECK: dup	v5.2d, x0             // encoding: [0x05,0x0c,0x08,0x4e]
+