diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -429,6 +429,57 @@
   default:
     break;
 
+  case AArch64::BSPv8i8:
+  case AArch64::BSPv16i8: {
+    Register DstReg = MI.getOperand(0).getReg();
+    if (DstReg == MI.getOperand(3).getReg()) {
+      // Expand to BIT
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+                                                  : AArch64::BITv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(1));
+    } else if (DstReg == MI.getOperand(2).getReg()) {
+      // Expand to BIF
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+                                                  : AArch64::BIFv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(1));
+    } else {
+      // Expand to BSL, use additional move if required
+      if (DstReg == MI.getOperand(1).getReg()) {
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+                                                    : AArch64::BSLv16i8))
+            .add(MI.getOperand(0))
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(2))
+            .add(MI.getOperand(3));
+      } else {
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
+                                                    : AArch64::ORRv16i8))
+            .addReg(DstReg)
+            .add(MI.getOperand(1))
+            .add(MI.getOperand(1));
+        BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+                                                    : AArch64::BSLv16i8))
+            .add(MI.getOperand(0))
+            .addReg(DstReg)
+            .add(MI.getOperand(2))
+            .add(MI.getOperand(3));
+      }
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+
   case AArch64::ADDWrr:
   case AArch64::SUBWrr:
   case AArch64::ADDXrr:
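Aside for readers of the expansion above: after register allocation the BSP pseudo is rewritten into whichever of BIT/BIF/BSL already has its tied input in the destination register, falling back to an ORR copy of the mask plus BSL when the destination matches none of the three sources. A minimal stand-alone sketch of that decision table follows; it is illustrative only (the Op enum, register-id parameters and helper name are invented for this note, not part of the patch):

#include <cassert>

// bsp dst, a, b, c computes dst = (a & b) | (~a & c), with no tied operand.
enum class Op { BIT, BIF, BSL, OrrThenBsl };

// dst, a (mask), b ("true" input), c ("false" input) are register ids.
Op chooseExpansion(unsigned dst, unsigned a, unsigned b, unsigned c) {
  if (dst == c)          // dst already holds the "false" input:
    return Op::BIT;      //   bit dst, b, a  -> dst = (dst & ~a) | (b & a)
  if (dst == b)          // dst already holds the "true" input:
    return Op::BIF;      //   bif dst, c, a  -> dst = (dst & a) | (c & ~a)
  if (dst == a)          // dst already holds the mask:
    return Op::BSL;      //   bsl dst, b, c  -> dst = (dst & b) | (~dst & c)
  return Op::OrrThenBsl; // copy mask first (orr dst, a, a), then bsl dst, b, c
}

int main() {
  assert(chooseExpansion(3, 1, 2, 3) == Op::BIT);
  assert(chooseExpansion(2, 1, 2, 3) == Op::BIF);
  assert(chooseExpansion(1, 1, 2, 3) == Op::BSL);
  assert(chooseExpansion(0, 1, 2, 3) == Op::OrrThenBsl);
  return 0;
}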
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -90,9 +90,9 @@
   BICi,
   ORRi,
 
-  // Vector bit select: similar to ISD::VSELECT but not all bits within an
+  // Vector bitwise select: similar to ISD::VSELECT but not all bits within an
   // element must be identical.
-  BSL,
+  BSP,
 
   // Vector arithmetic negation
   NEG,
@@ -166,7 +166,7 @@
   // Vector bitwise negation
   NOT,
 
-  // Vector bitwise selection
+  // Vector bitwise insertion
   BIT,
 
   // Compare-and-branch
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1287,7 +1287,7 @@
   case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
   case AArch64ISD::BICi: return "AArch64ISD::BICi";
   case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
-  case AArch64ISD::BSL: return "AArch64ISD::BSL";
+  case AArch64ISD::BSP: return "AArch64ISD::BSP";
   case AArch64ISD::NEG: return "AArch64ISD::NEG";
   case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
   case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
@@ -10229,7 +10229,7 @@
     }
 
     if (FoundMatch)
-      return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+      return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
                          N0->getOperand(1 - i), N1->getOperand(1 - j));
   }
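For context (reviewer note, not part of the patch): the renamed AArch64ISD::BSP node keeps the bitwise-select semantics of the old BSL node; it only stops promising which source operand the destination will be tied to. A scalar model of the select that the DAG combine above now produces, written as a small self-contained check:

#include <cassert>
#include <cstdint>

// bsp(mask, t, f): per bit, take the bit of t where mask is 1, of f where mask is 0.
uint64_t bsp(uint64_t mask, uint64_t t, uint64_t f) {
  return (mask & t) | (~mask & f);
}

int main() {
  uint64_t m = 0x00ff00ff00ff00ffULL, t = ~0ULL, f = 0;
  assert(bsp(m, t, f) == m);                       // select picks t exactly under the mask
  assert(((t & m) | (f & ~m)) == bsp(m, t, f));    // the or/and/not shape matched by the combine
  return 0;
}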
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5207,6 +5207,47 @@
   let Inst{4-0} = Rd;
 }
 
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
+  : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
+    Sched<[WriteV]>;
+
+multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorPseudo<V64, [(set (v8i8 V64:$dst),
+                             (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorPseudo<V128, [(set (v16i8 V128:$dst),
+                             (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+            (!cast<Instruction>(NAME#"v8i8")
+              V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+            (!cast<Instruction>(NAME#"v16i8")
+              V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
 // All operand sizes distinguished in the encoding.
 multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
@@ -5427,7 +5468,7 @@
 }
 
 multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
-                                      string asm, SDPatternOperator OpNode> {
+                                      string asm, SDPatternOperator OpNode = null_frag> {
   def v8i8  : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
                                           asm, ".8b",
     [(set (v8i8 V64:$dst),
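One more illustrative aside (mine, not from the patch): a single untied BSP pseudo can legally become any of BSL, BIT or BIF after register allocation because the three instructions compute the same select and differ only in which input the tied destination register supplies. A quick self-contained check of that equivalence, using plain scalar stand-ins for the vector registers:

#include <cassert>
#include <cstdint>

// Each function overwrites its first argument, mirroring the tied operand.
uint64_t bsl(uint64_t d /*mask*/, uint64_t n, uint64_t m) { return (d & n) | (~d & m); }
uint64_t bit(uint64_t d, uint64_t n, uint64_t m /*mask*/) { return (d & ~m) | (n & m); }
uint64_t bif(uint64_t d, uint64_t n, uint64_t m /*mask*/) { return (d & m) | (n & ~m); }

int main() {
  uint64_t mask = 0x0f0f0f0f0f0f0f0fULL;
  uint64_t a = 0x1122334455667788ULL, b = 0x99aabbccddeeff00ULL;
  uint64_t expected = (mask & a) | (~mask & b);  // bsp(mask, a, b)
  assert(bsl(mask, a, b) == expected);  // destination held the mask
  assert(bit(b, a, mask) == expected);  // destination held the "false" input
  assert(bif(a, b, mask) == expected);  // destination held the "true" input
  return 0;
}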
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -468,7 +468,7 @@
 def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
 
 def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
-def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;
 
 def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
 def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
@@ -3955,33 +3955,53 @@
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
-    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
 defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
 defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
                                   BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
 defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-
-def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
-          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
-          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+// Pseudo bitwise select pattern BSP.
+// It is expanded into BSL/BIT/BIF after register allocation.
+defm BSP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS),
+                                                      (and (vnot node:$LHS), node:$RHS))>>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">;
+
+def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(AArch64bit (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bit (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
 
 def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
                 (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -501,7 +501,7 @@
 //   Q form - v16i8, v8i16, v4i32, v2i64
 
 // ASIMD bitwise insert, Q-form
-def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>;
 
 // ASIMD duplicate, gen reg, D-form and Q-form
 def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -494,7 +494,7 @@
 // WriteV includes:
 // SHLL,SSHLL,USHLL
 // SLI,SRI
-// BIF,BIT,BSL
+// BIF,BIT,BSL,BSP
 // EXT
 // CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
 // XTN2
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -660,7 +660,7 @@
 // ASIMD miscellaneous instructions.
def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>; def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td @@ -803,7 +803,7 @@ // ASIMD miscellaneous instructions. def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>; def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td +++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td @@ -841,7 +841,7 @@ // ASIMD miscellaneous instructions. def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>; -def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>; +def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>; def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>; def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>; def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td --- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>; def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; @@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td --- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td +++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -462,13 +462,13 @@ let Latency = 1; let NumMicroOps = 2; } def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], - (instrs BIFv8i8, BITv8i8, BSLv8i8)>; + (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>; def KryoWrite_1cyc_X_X_75ln : SchedWriteRes<[KryoUnitX, KryoUnitX]> { let Latency = 1; let NumMicroOps = 2; } def : 
InstRW<[KryoWrite_1cyc_X_X_75ln], - (instrs BIFv16i8, BITv16i8, BSLv16i8)>; + (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>; def KryoWrite_0cyc_noRSV_11ln : SchedWriteRes<[]> { let Latency = 0; let NumMicroOps = 1; diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td --- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td +++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td @@ -1482,7 +1482,7 @@ // ASIMD bitwise insert, D-form // ASIMD bitwise insert, Q-form def : InstRW<[THX2T99Write_5Cyc_F01], - (instregex "^BIFv", "^BITv", "^BSLv")>; + (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>; // ASIMD count, D-form // ASIMD count, Q-form diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIF Bitwise Insert if False +; +; 8-bit vectors tests + +define <1 x i8> @test_bitf_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bitf_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i8> %C, + %and = and <1 x i8> %neg, %B + %and1 = and <1 x i8> %C, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bitf_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bitf_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i16> %C, + %and = and <1 x i16> %neg, %B + %and1 = and <1 x i16> %C, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bitf_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i32> %C, + %and = and <1 x i32> %neg, %B + %and1 = and <1 x i32> %C, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bitf_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bitf_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <1 x i64> %C, + %and = and <1 x i64> %neg, %B + %and1 = and <1 x i64> %C, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bitf_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bitf_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <2 x i32> %C, + %and = and <2 x i32> %neg, %B + %and1 = and <2 x i32> %C, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + +define <4 x i16> @test_bitf_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bitf_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <4 x i16> %C, + %and = and <4 x i16> %neg, %B + %and1 = and <4 x i16> %C, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bitf_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bitf_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %neg = xor <8 x i8> %C, + %and = and <8 x i8> %neg, %B + %and1 = and <8 x i8> %C, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + 
+; 128-bit vectors tests + +define <2 x i64> @test_bitf_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bitf_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <2 x i64> %C, + %and = and <2 x i64> %neg, %B + %and1 = and <2 x i64> %C, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bitf_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bitf_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <4 x i32> %C, + %and = and <4 x i32> %neg, %B + %and1 = and <4 x i32> %C, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bitf_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bitf_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <8 x i16> %C, + %and = and <8 x i16> %neg, %B + %and1 = and <8 x i16> %C, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bitf_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bitf_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %neg = xor <16 x i8> %C, + %and = and <16 x i8> %neg, %B + %and1 = and <16 x i8> %C, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; BIT Bitwise Insert if True +; +; 8-bit vectors tests + +define <1 x i8> @test_bit_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) { +; CHECK-LABEL: test_bit_v1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i8> %C, %B + %neg = xor <1 x i8> %C, + %and1 = and <1 x i8> %neg, %A + %or = or <1 x i8> %and, %and1 + ret <1 x i8> %or +} + +; 16-bit vectors tests + +define <1 x i16> @test_bit_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) { +; CHECK-LABEL: test_bit_v1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i16> %C, %B + %neg = xor <1 x i16> %C, + %and1 = and <1 x i16> %neg, %A + %or = or <1 x i16> %and, %and1 + ret <1 x i16> %or +} + +; 32-bit vectors tests + +define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { +; CHECK-LABEL: test_bit_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i32> %C, %B + %neg = xor <1 x i32> %C, + %and1 = and <1 x i32> %neg, %A + %or = or <1 x i32> %and, %and1 + ret <1 x i32> %or +} + +; 64-bit vectors tests + +define <1 x i64> @test_bit_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) { +; CHECK-LABEL: test_bit_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <1 x i64> %C, %B + %neg = xor <1 x i64> %C, + %and1 = and <1 x i64> %neg, %A + %or = or <1 x i64> %and, %and1 + ret <1 x i64> %or +} + +define <2 x i32> @test_bit_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +; CHECK-LABEL: test_bit_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <2 x i32> %C, %B + %neg = xor <2 x i32> %C, + %and1 = and <2 x i32> %neg, %A + %or = or <2 x i32> %and, %and1 + ret <2 x i32> %or +} + 
+define <4 x i16> @test_bit_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +; CHECK-LABEL: test_bit_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <4 x i16> %C, %B + %neg = xor <4 x i16> %C, + %and1 = and <4 x i16> %neg, %A + %or = or <4 x i16> %and, %and1 + ret <4 x i16> %or +} + +define <8 x i8> @test_bit_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +; CHECK-LABEL: test_bit_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.8b, v1.8b, v2.8b +; CHECK-NEXT: ret + %and = and <8 x i8> %C, %B + %neg = xor <8 x i8> %C, + %and1 = and <8 x i8> %neg, %A + %or = or <8 x i8> %and, %and1 + ret <8 x i8> %or +} + +; 128-bit vectors tests + +define <2 x i64> @test_bit_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: test_bit_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <2 x i64> %C, %B + %neg = xor <2 x i64> %C, + %and1 = and <2 x i64> %neg, %A + %or = or <2 x i64> %and, %and1 + ret <2 x i64> %or +} + +define <4 x i32> @test_bit_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_bit_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <4 x i32> %C, %B + %neg = xor <4 x i32> %C, + %and1 = and <4 x i32> %neg, %A + %or = or <4 x i32> %and, %and1 + ret <4 x i32> %or +} + +define <8 x i16> @test_bit_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +; CHECK-LABEL: test_bit_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <8 x i16> %C, %B + %neg = xor <8 x i16> %C, + %and1 = and <8 x i16> %neg, %A + %or = or <8 x i16> %and, %and1 + ret <8 x i16> %or +} + +define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +; CHECK-LABEL: test_bit_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: ret + %and = and <16 x i8> %C, %B + %neg = xor <16 x i8> %C, + %and1 = and <16 x i8> %neg, %A + %or = or <16 x i8> %and, %and1 + ret <16 x i8> %or +} diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll --- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll @@ -9,8 +9,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8b, v3.8b, v2.8b ; CHECK-NEXT: dup v2.8b, v2.b[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d @@ -49,8 +48,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.16b, v3.16b, v2.16b ; CHECK-NEXT: dup v2.16b, v2.b[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i8 %a, %b %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d @@ -92,8 +90,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4h, v3.4h, v2.4h ; CHECK-NEXT: dup v2.4h, v2.h[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d @@ -107,8 +104,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.8h, v3.8h, v2.8h ; CHECK-NEXT: dup v2.8h, v2.h[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i16 %a, %b %e = select i1 %cmp31, <8x i16> 
%c, <8x i16> %d @@ -122,8 +118,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.2s, v3.2s, v2.2s ; CHECK-NEXT: dup v2.2s, v2.s[0] -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d @@ -137,8 +132,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d @@ -151,8 +145,7 @@ ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d @@ -166,8 +159,7 @@ ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq v2.2d, v3.2d, v2.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d @@ -222,8 +214,7 @@ ; CHECK-NEXT: fmov s3, w0 ; CHECK-NEXT: cmeq v2.4s, v3.4s, v2.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cmp31 = icmp eq i32 %a, %b %e = select i1 %cmp31, <4x float> %c, <4x float> %d @@ -247,8 +238,7 @@ ; CHECK-NEXT: fmov d2, x1 ; CHECK-NEXT: fmov d3, x0 ; CHECK-NEXT: cmeq d2, d3, d2 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp31 = icmp eq i64 %a, %b %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d @@ -278,8 +268,7 @@ ; CHECK-NEXT: tst w0, #0x1 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: dup v2.2s, w8 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %cmp = icmp ne i1 %cc, 0 %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b @@ -294,8 +283,7 @@ ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: fcmeq v2.4s, v2.4s, v3.4s ; CHECK-NEXT: dup v2.4s, v2.s[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq float %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b @@ -309,8 +297,7 @@ ; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3 ; CHECK-NEXT: fcmeq v2.2d, v2.2d, v3.2d ; CHECK-NEXT: dup v2.2d, v2.d[0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %cc = fcmp oeq double %c1, %c2 %r = select i1 %cc, <3 x float> %a, <3 x float> %b diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll --- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll @@ -5,8 +5,7 @@ define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 { ; CHECK-LABEL: select_64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret entry: %0 = bitcast <4 x half> %a to <4 x i16> @@ -23,8 +22,7 @@ 
define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 { ; CHECK-LABEL: select_128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret entry: %0 = bitcast <8 x half> %a to <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll --- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -61,8 +61,7 @@ ; CHECK-LABEL: bsl8xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 > %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 > @@ -74,8 +73,7 @@ ; CHECK-LABEL: bsl16xi8_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 > %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 > @@ -664,8 +662,7 @@ ; CHECK-LABEL: bsl2xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x000000ffffffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 > %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 > @@ -678,8 +675,7 @@ ; CHECK-LABEL: bsl4xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 > %tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 > @@ -691,8 +687,7 @@ ; CHECK-LABEL: bsl1xi64_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi d2, #0xffffffffffffff00 -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %tmp1 = and <1 x i64> %a, < i64 -256 > %tmp2 = and <1 x i64> %b, < i64 255 > @@ -704,8 +699,7 @@ ; CHECK-LABEL: bsl4xi32_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 > %tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 > @@ -717,8 +711,7 @@ ; CHECK-LABEL: bsl8xi16_const: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.2d, #0x000000ffffffff -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 > %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 > @@ -731,8 +724,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI75_0 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI75_0] -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %tmp1 = and <2 x i64> %a, 
< i64 -1, i64 0 > %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 > diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -480,9 +480,9 @@ ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: mov w9, #42 ; CHECK-NEXT: cmhi v2.2d, v1.2d, v0.2d -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v0.2d, x9 -; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-NEXT: dup v1.2d, x9 +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %c = icmp ult <2 x i64> %x, %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> @@ -653,8 +653,8 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mvn v2.16b, v1.16b ; CHECK-NEXT: cmhi v3.2d, v2.2d, v0.2d -; CHECK-NEXT: bsl v3.16b, v0.16b, v2.16b -; CHECK-NEXT: add v0.2d, v3.2d, v1.2d +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %noty = xor <2 x i64> %y, %c = icmp ult <2 x i64> %x, %noty diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll --- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll @@ -71,10 +71,9 @@ ; CHECK-NEXT: fmul v2.2s, v1.2s, v1.2s ; CHECK-NEXT: frsqrts v2.2s, v0.2s, v2.2s ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.2s -; CHECK-NEXT: fmul v2.2s, v1.2s, v2.2s -; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 -; CHECK-NEXT: bsl v1.8b, v0.8b, v2.8b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2s, v1.2s, v2.2s +; CHECK-NEXT: fcmeq v2.2s, v0.2s, #0.0 +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) ret <2 x float> %1 @@ -95,10 +94,9 @@ ; CHECK-NEXT: fmul v2.4s, v1.4s, v1.4s ; CHECK-NEXT: frsqrts v2.4s, v0.4s, v2.4s ; CHECK-NEXT: fmul v2.4s, v2.4s, v0.4s -; CHECK-NEXT: fmul v2.4s, v1.4s, v2.4s -; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) ret <4 x float> %1 @@ -120,21 +118,19 @@ ; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v0.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v0.4s -; CHECK-NEXT: fmul v3.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.4s, v1.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fcmeq v3.4s, v0.4s, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.4s, v1.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fmul v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-NEXT: fmul v3.4s, v2.4s, v2.4s ; CHECK-NEXT: frsqrts v3.4s, v1.4s, v3.4s ; CHECK-NEXT: fmul v3.4s, v3.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v1.4s, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a) ret <8 x float> %1 @@ -210,10 +206,9 @@ ; CHECK-NEXT: fmul v2.2d, v1.2d, v1.2d ; CHECK-NEXT: frsqrts v2.2d, v0.2d, v2.2d ; 
CHECK-NEXT: fmul v2.2d, v2.2d, v0.2d -; CHECK-NEXT: fmul v2.2d, v1.2d, v2.2d -; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v1.2d, v1.2d, v2.2d +; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) ret <2 x double> %1 @@ -238,24 +233,22 @@ ; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v0.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v0.2d -; CHECK-NEXT: fmul v3.2d, v2.2d, v3.2d -; CHECK-NEXT: fcmeq v2.2d, v0.2d, #0.0 -; CHECK-NEXT: bsl v2.16b, v0.16b, v3.16b -; CHECK-NEXT: frsqrte v0.2d, v1.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fcmeq v3.2d, v0.2d, #0.0 +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b +; CHECK-NEXT: frsqrte v2.2d, v1.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d -; CHECK-NEXT: fmul v3.2d, v0.2d, v0.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-NEXT: fmul v3.2d, v2.2d, v2.2d ; CHECK-NEXT: frsqrts v3.2d, v1.2d, v3.2d ; CHECK-NEXT: fmul v3.2d, v3.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v3.2d +; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d ; CHECK-NEXT: fcmeq v3.2d, v1.2d, #0.0 -; CHECK-NEXT: bsl v3.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: mov v1.16b, v3.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ret %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a) ret <4 x double> %1 diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll @@ -62,8 +62,7 @@ ; CHECK-LABEL: out_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, %x @@ -76,8 +75,7 @@ ; CHECK-LABEL: in_constant_varx_42: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, ; %x %n1 = and <4 x i32> %n0, %mask @@ -90,8 +88,7 @@ ; CHECK-LABEL: out_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, %x @@ -105,8 +102,7 @@ ; CHECK-LABEL: in_constant_varx_42_invmask: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> %x, ; %x @@ -169,9 +165,8 @@ define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov 
v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %mask, @@ -183,9 +178,8 @@ define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> , %y ; %x %n1 = and <4 x i32> %n0, %mask @@ -197,9 +191,8 @@ define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: out_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %mx = and <4 x i32> %notmask, @@ -212,9 +205,8 @@ define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) { ; CHECK-LABEL: in_constant_42_vary_invmask: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: movi v2.4s, #42 -; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b +; CHECK-NEXT: movi v0.4s, #42 +; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %notmask = xor <4 x i32> %mask, %n0 = xor <4 x i32> , %y ; %x diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll --- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll @@ -13,8 +13,7 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -46,8 +45,7 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -111,8 +109,7 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -128,8 +125,7 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: out_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <8 x i8> %x, %mask %notmask = xor <8 x i8> %mask, @@ -141,8 +137,7 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -154,8 +149,7 @@ define <4 x 
i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: out_v4i16_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <4 x i16> %x, %mask %notmask = xor <4 x i16> %mask, @@ -167,8 +161,7 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -180,8 +173,7 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -197,8 +189,7 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: out_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <16 x i8> %x, %mask %notmask = xor <16 x i8> %mask, @@ -210,8 +201,7 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: out_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <8 x i16> %x, %mask %notmask = xor <8 x i16> %mask, @@ -223,8 +213,7 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -236,8 +225,7 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -249,8 +237,7 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, @@ -270,8 +257,7 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i8> %x, %y %n1 = and <1 x i8> %n0, %mask @@ -286,8 +272,7 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-LABEL: in_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i8> %x, %y %n1 = and <2 x i8> %n0, %mask @@ -298,8 +283,7 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: in_v1i16: ; CHECK: // %bb.0: -; 
CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i16> %x, %y %n1 = and <1 x i16> %n0, %mask @@ -314,8 +298,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-LABEL: in_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i8> %x, %y %n1 = and <4 x i8> %n0, %mask @@ -326,8 +309,7 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-LABEL: in_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i16> %x, %y %n1 = and <2 x i16> %n0, %mask @@ -338,8 +320,7 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: in_v1i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i32> %x, %y %n1 = and <1 x i32> %n0, %mask @@ -354,8 +335,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-LABEL: in_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <8 x i8> %x, %y %n1 = and <8 x i8> %n0, %mask @@ -366,8 +346,7 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind { ; CHECK-LABEL: in_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <4 x i16> %x, %y %n1 = and <4 x i16> %n0, %mask @@ -378,8 +357,7 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: in_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <2 x i32> %x, %y %n1 = and <2 x i32> %n0, %mask @@ -390,8 +368,7 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: in_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b ; CHECK-NEXT: ret %n0 = xor <1 x i64> %x, %y %n1 = and <1 x i64> %n0, %mask @@ -406,8 +383,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind { ; CHECK-LABEL: in_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <16 x i8> %x, %y %n1 = and <16 x i8> %n0, %mask @@ -418,8 +394,7 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind { ; CHECK-LABEL: in_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <8 x i16> %x, %y %n1 = and <8 x i16> %n0, %mask @@ -430,8 +405,7 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: in_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <4 x i32> %x, %y %n1 = and <4 x i32> %n0, %mask @@ -442,8 +416,7 
@@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: in_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-NEXT: ret %n0 = xor <2 x i64> %x, %y %n1 = and <2 x i64> %n0, %mask diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -318,8 +318,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -351,8 +351,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -577,8 +577,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -606,8 +606,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -637,8 +637,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -670,8 +670,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -699,8 +699,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -729,8 +729,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_3] ; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b 
-; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b @@ -761,8 +761,8 @@ ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4] ; CHECK-NEXT: neg v3.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -104,8 +104,8 @@ ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s ; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4] ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b -; CHECK-NEXT: mls v0.4s, v3.4s, v4.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret
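Closing reviewer note (mine, not part of the patch): most of the test churn above is the disappearance of the extra `mov` that the old tied BSL forced, with the select now emitted directly as `bif`/`bit` on the destination register. The `in_*`/`out_*` masked-merge tests rely on the identity below, which is why both IR shapes end up as the same select instruction; a quick self-contained check of that identity:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t x = 0x0123456789abcdefULL, y = 0xfedcba9876543210ULL,
           m = 0x00ff00ff00ff00ffULL;
  uint64_t out = (x & m) | (y & ~m);  // the "out_*" test pattern
  uint64_t in  = ((x ^ y) & m) ^ y;   // the "in_*" test pattern
  assert(in == out);                  // both are a bitwise select of x/y by m
  return 0;
}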