diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -429,6 +429,57 @@
   default:
     break;
 
+  case AArch64::BSLPv8i8:
+  case AArch64::BSLPv16i8: {
+    // BSLP is (Op1 & Op2) | (~Op1 & Op3); pick BIT/BIF/BSL by DstReg's alias.
+    Register DstReg = MI.getOperand(0).getReg();
+    if (DstReg == MI.getOperand(3).getReg()) {
+      // BIT: DstReg holds Op3; insert Op2 bits where mask Op1 is set.
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BITv8i8
+                                                   : AArch64::BITv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(1));
+    } else if (DstReg == MI.getOperand(2).getReg()) {
+      // BIF: DstReg holds Op2; insert Op3 bits where mask Op1 is clear.
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BIFv8i8
+                                                   : AArch64::BIFv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3))
+          .add(MI.getOperand(1));
+    } else if (DstReg == MI.getOperand(1).getReg()) {
+      // BSL: DstReg already holds the mask Op1.
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BSLv8i8
+                                                   : AArch64::BSLv16i8))
+          .add(MI.getOperand(0))
+          .add(MI.getOperand(1))
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3));
+    } else {
+      // Otherwise copy the mask Op1 into DstReg (ORR as MOV), then BSL.
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::ORRv8i8
+                                                   : AArch64::ORRv16i8))
+          .addReg(DstReg, RegState::Define)
+          .add(MI.getOperand(1))
+          .add(MI.getOperand(1));
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              TII->get(Opcode == AArch64::BSLPv8i8 ? AArch64::BSLv8i8
+                                                   : AArch64::BSLv16i8))
+          .add(MI.getOperand(0))
+          .addReg(DstReg, RegState::Kill)
+          .add(MI.getOperand(2))
+          .add(MI.getOperand(3));
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+
   case AArch64::ADDWrr:
   case AArch64::SUBWrr:
   case AArch64::ADDXrr:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -92,7 +92,7 @@
 
   // Vector bit select: similar to ISD::VSELECT but not all bits within an
   // element must be identical.
-  BSL,
+  BSLP,
 
   // Vector arithmetic negation
   NEG,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1287,7 +1287,7 @@
   case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
   case AArch64ISD::BICi:              return "AArch64ISD::BICi";
   case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
-  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
+  case AArch64ISD::BSLP:              return "AArch64ISD::BSLP";
   case AArch64ISD::NEG:               return "AArch64ISD::NEG";
   case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
   case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
@@ -10228,7 +10228,7 @@
       }
 
       if (FoundMatch)
-        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+        return DAG.getNode(AArch64ISD::BSLP, DL, VT, SDValue(BVN0, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
     }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5207,6 +5207,47 @@
   let Inst{4-0}   = Rd;
 }
 
+// Three-source vector pseudo with a dedicated $dst output operand.
+// Declared free of loads, stores and side effects; scheduled as WriteV.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
+  : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm),
+           pattern>,
+    Sched<[WriteV]>;
+
+// Instantiates v8i8 and v16i8 pseudos that match OpNode directly, and maps
+// the remaining 64-bit and 128-bit integer vector types onto those two.
+multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
+  def v8i8 : BaseSIMDThreeSameVectorPseudo<V64,
+      [(set (v8i8 V64:$dst),
+            (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorPseudo<V128,
+      [(set (v16i8 V128:$dst),
+            (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+
+  // v4i16, v2i32 and v1i64 select the v8i8 pseudo.
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                           (v4i16 V64:$Rm))),
+            (!cast<Instruction>(NAME#"v8i8") V64:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                           (v2i32 V64:$Rm))),
+            (!cast<Instruction>(NAME#"v8i8") V64:$Rd, V64:$Rn, V64:$Rm)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$Rd), (v1i64 V64:$Rn),
+                           (v1i64 V64:$Rm))),
+            (!cast<Instruction>(NAME#"v8i8") V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+  // v8i16, v4i32 and v2i64 select the v16i8 pseudo.
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                           (v8i16 V128:$Rm))),
+            (!cast<Instruction>(NAME#"v16i8") V128:$Rd, V128:$Rn, V128:$Rm)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                           (v4i32 V128:$Rm))),
+            (!cast<Instruction>(NAME#"v16i8") V128:$Rd, V128:$Rn, V128:$Rm)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                           (v2i64 V128:$Rm))),
+            (!cast<Instruction>(NAME#"v16i8") V128:$Rd, V128:$Rn, V128:$Rm)>;
+}
+
 // All operand sizes distinguished in the encoding.
 multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
                                SDPatternOperator OpNode> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -468,7 +468,7 @@
 
 def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
 def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
-def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+def AArch64bslp: SDNode<"AArch64ISD::BSLP", SDT_AArch64trivec>;
 
 def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
 def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
@@ -3955,34 +3955,90 @@
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
                                   BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
-    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
 defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
 defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
                                   BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
 defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
 
-
-def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+// BSLP is a pseudo for the bitwise-select pattern below. It is expanded
+// into BSL, BIT or BIF after register allocation.
+defm BSLP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", AArch64bslp>;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif", AArch64bslp>;
+
+def : Pat<(AArch64bslp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bslp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bslp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bslp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bslp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bslp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bslp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bslp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(AArch64bslp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
           (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+def : Pat<(AArch64bslp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
           (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+def : Pat<(AArch64bslp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
           (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+def : Pat<(AArch64bslp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
           (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
 
-def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+def : Pat<(AArch64bslp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
           (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+def : Pat<(AArch64bslp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
           (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+def : Pat<(AArch64bslp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
           (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+def : Pat<(AArch64bslp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
           (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
 
+def : Pat<(AArch64bit (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bit (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BITv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bit (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bit (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BITv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(AArch64bslp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bslp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bslp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bslp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BIFv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bslp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bslp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bslp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bslp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BIFv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
 def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
                 (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
 def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; BIF (Bitwise Insert if False): (C & A) | (~C & B) should select a single
+; BIF that updates A's register in place.
+; 8-bit vector tests
+
+define <1 x i8> @test_bitf_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) {
+; CHECK-LABEL: test_bitf_v1i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <1 x i8> %C, <i8 -1>
+  %and = and <1 x i8> %neg, %B
+  %and1 = and <1 x i8> %C, %A
+  %or = or <1 x i8> %and, %and1
+  ret <1 x i8> %or
+}
+
+; 16-bit vectors tests
+
+define <1 x i16> @test_bitf_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) {
+; CHECK-LABEL: test_bitf_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <1 x i16> %C, <i16 -1>
+  %and = and <1 x i16> %neg, %B
+  %and1 = and <1 x i16> %C, %A
+  %or = or <1 x i16> %and, %and1
+  ret <1 x i16> %or
+}
+
+; 32-bit vectors tests
+
+define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
+; CHECK-LABEL: test_bitf_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <1 x i32> %C, <i32 -1>
+  %and = and <1 x i32> %neg, %B
+  %and1 = and <1 x i32> %C, %A
+  %or = or <1 x i32> %and, %and1
+  ret <1 x i32> %or
+}
+
+; 64-bit vectors tests
+
+define <1 x i64> @test_bitf_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) {
+; CHECK-LABEL: test_bitf_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <1 x i64> %C, <i64 -1>
+  %and = and <1 x i64> %neg, %B
+  %and1 = and <1 x i64> %C, %A
+  %or = or <1 x i64> %and, %and1
+  ret <1 x i64> %or
+}
+
+define <2 x i32> @test_bitf_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
+; CHECK-LABEL: test_bitf_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <2 x i32> %C, <i32 -1, i32 -1>
+  %and = and <2 x i32> %neg, %B
+  %and1 = and <2 x i32> %C, %A
+  %or = or <2 x i32> %and, %and1
+  ret <2 x i32> %or
+}
+
+define <4 x i16> @test_bitf_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
+; CHECK-LABEL: test_bitf_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <4 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %and = and <4 x i16> %neg, %B
+  %and1 = and <4 x i16> %C, %A
+  %or = or <4 x i16> %and, %and1
+  ret <4 x i16> %or
+}
+
+define <8 x i8> @test_bitf_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
+; CHECK-LABEL: test_bitf_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %neg = xor <8 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and = and <8 x i8> %neg, %B
+  %and1 = and <8 x i8> %C, %A
+  %or = or <8 x i8> %and, %and1
+  ret <8 x i8> %or
+}
+
+; 128-bit vectors tests
+
+define <2 x i64> @test_bitf_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: test_bitf_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg = xor <2 x i64> %C, <i64 -1, i64 -1>
+  %and = and <2 x i64> %neg, %B
+  %and1 = and <2 x i64> %C, %A
+  %or = or <2 x i64> %and, %and1
+  ret <2 x i64> %or
+}
+
+define <4 x i32> @test_bitf_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_bitf_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and = and <4 x i32> %neg, %B
+  %and1 = and <4 x i32> %C, %A
+  %or = or <4 x i32> %and, %and1
+  ret <4 x i32> %or
+}
+
+define <8 x i16> @test_bitf_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
+; CHECK-LABEL: test_bitf_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg = xor <8 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %and = and <8 x i16> %neg, %B
+  %and1 = and <8 x i16> %C, %A
+  %or = or <8 x i16> %and, %and1
+  ret <8 x i16> %or
+}
+
+define <16 x i8> @test_bitf_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK-LABEL: test_bitf_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg = xor <16 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and = and <16 x i8> %neg, %B
+  %and1 = and <16 x i8> %C, %A
+  %or = or <16 x i8> %and, %and1
+  ret <16 x i8> %or
+}
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; BIT (Bitwise Insert if True): (C & B) | (~C & A) should select a single
+; BIT that updates A's register in place.
+; 8-bit vector tests
+
+define <1 x i8> @test_bit_v1i8(<1 x i8> %A, <1 x i8> %B, <1 x i8> %C) {
+; CHECK-LABEL: test_bit_v1i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <1 x i8> %C, %B
+  %neg = xor <1 x i8> %C, <i8 -1>
+  %and1 = and <1 x i8> %neg, %A
+  %or = or <1 x i8> %and, %and1
+  ret <1 x i8> %or
+}
+
+; 16-bit vectors tests
+
+define <1 x i16> @test_bit_v1i16(<1 x i16> %A, <1 x i16> %B, <1 x i16> %C) {
+; CHECK-LABEL: test_bit_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <1 x i16> %C, %B
+  %neg = xor <1 x i16> %C, <i16 -1>
+  %and1 = and <1 x i16> %neg, %A
+  %or = or <1 x i16> %and, %and1
+  ret <1 x i16> %or
+}
+
+; 32-bit vectors tests
+
+define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
+; CHECK-LABEL: test_bit_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <1 x i32> %C, %B
+  %neg = xor <1 x i32> %C, <i32 -1>
+  %and1 = and <1 x i32> %neg, %A
+  %or = or <1 x i32> %and, %and1
+  ret <1 x i32> %or
+}
+
+; 64-bit vectors tests
+
+define <1 x i64> @test_bit_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C) {
+; CHECK-LABEL: test_bit_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <1 x i64> %C, %B
+  %neg = xor <1 x i64> %C, <i64 -1>
+  %and1 = and <1 x i64> %neg, %A
+  %or = or <1 x i64> %and, %and1
+  ret <1 x i64> %or
+}
+
+define <2 x i32> @test_bit_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
+; CHECK-LABEL: test_bit_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <2 x i32> %C, %B
+  %neg = xor <2 x i32> %C, <i32 -1, i32 -1>
+  %and1 = and <2 x i32> %neg, %A
+  %or = or <2 x i32> %and, %and1
+  ret <2 x i32> %or
+}
+
+define <4 x i16> @test_bit_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
+; CHECK-LABEL: test_bit_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <4 x i16> %C, %B
+  %neg = xor <4 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %and1 = and <4 x i16> %neg, %A
+  %or = or <4 x i16> %and, %and1
+  ret <4 x i16> %or
+}
+
+define <8 x i8> @test_bit_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
+; CHECK-LABEL: test_bit_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.8b, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %and = and <8 x i8> %C, %B
+  %neg = xor <8 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and1 = and <8 x i8> %neg, %A
+  %or = or <8 x i8> %and, %and1
+  ret <8 x i8> %or
+}
+
+; 128-bit vectors tests
+
+define <2 x i64> @test_bit_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C) {
+; CHECK-LABEL: test_bit_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %and = and <2 x i64> %C, %B
+  %neg = xor <2 x i64> %C, <i64 -1, i64 -1>
+  %and1 = and <2 x i64> %neg, %A
+  %or = or <2 x i64> %and, %and1
+  ret <2 x i64> %or
+}
+
+define <4 x i32> @test_bit_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_bit_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %and = and <4 x i32> %C, %B
+  %neg = xor <4 x i32> %C, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %and1 = and <4 x i32> %neg, %A
+  %or = or <4 x i32> %and, %and1
+  ret <4 x i32> %or
+}
+
+define <8 x i16> @test_bit_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
+; CHECK-LABEL: test_bit_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %and = and <8 x i16> %C, %B
+  %neg = xor <8 x i16> %C, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %and1 = and <8 x i16> %neg, %A
+  %or = or <8 x i16> %and, %and1
+  ret <8 x i16> %or
+}
+
+define <16 x i8> @test_bit_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK-LABEL: test_bit_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %and = and <16 x i8> %C, %B
+  %neg = xor <16 x i8> %C, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and1 = and <16 x i8> %neg, %A
+  %or = or <16 x i8> %and, %and1
+  ret <16 x i8> %or
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
--- a/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
@@ -9,8 +9,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.8b, v3.8b, v2.8b
 ; CHECK-NEXT:    dup v2.8b, v2.b[0]
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i8 %a, %b
   %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
@@ -49,8 +48,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.16b, v3.16b, v2.16b
 ; CHECK-NEXT:    dup v2.16b, v2.b[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i8 %a, %b
   %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
@@ -92,8 +90,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.4h, v3.4h, v2.4h
 ; CHECK-NEXT:    dup v2.4h, v2.h[0]
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i16 %a, %b
   %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d
@@ -107,8 +104,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.8h, v3.8h, v2.8h
 ; CHECK-NEXT:    dup v2.8h, v2.h[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i16 %a, %b
   %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d
@@ -122,8 +118,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.2s, v3.2s, v2.2s
 ; CHECK-NEXT:    dup v2.2s, v2.s[0]
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i32 %a, %b
   %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d
@@ -137,8 +132,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i32 %a, %b
   %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d
@@ -151,8 +145,7 @@
 ; CHECK-NEXT:    fmov d2, x1
 ; CHECK-NEXT:    fmov d3, x0
 ; CHECK-NEXT:    cmeq d2, d3, d2
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i64 %a, %b
   %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d
@@ -166,8 +159,7 @@
 ; CHECK-NEXT:    fmov d3, x0
 ; CHECK-NEXT:    cmeq v2.2d, v3.2d, v2.2d
 ; CHECK-NEXT:    dup v2.2d, v2.d[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i64 %a, %b
   %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d
@@ -222,8 +214,7 @@
 ; CHECK-NEXT:    fmov s3, w0
 ; CHECK-NEXT:    cmeq v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i32 %a, %b
   %e = select i1 %cmp31, <4x float> %c, <4x float> %d
@@ -247,8 +238,7 @@
 ; CHECK-NEXT:    fmov d2, x1
 ; CHECK-NEXT:    fmov d3, x0
 ; CHECK-NEXT:    cmeq d2, d3, d2
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %cmp31 = icmp eq i64 %a, %b
   %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
@@ -278,8 +268,7 @@
 ; CHECK-NEXT:    tst w0, #0x1
 ; CHECK-NEXT:    csetm w8, ne
 ; CHECK-NEXT:    dup v2.2s, w8
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %cmp = icmp ne i1 %cc, 0
   %e = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
@@ -294,8 +283,7 @@
 ; CHECK-NEXT:    // kill: def $s3 killed $s3 def $q3
 ; CHECK-NEXT:    fcmeq v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    dup v2.4s, v2.s[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cc = fcmp oeq float %c1, %c2
   %r = select i1 %cc, <3 x float> %a, <3 x float> %b
@@ -309,8 +297,7 @@
 ; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
 ; CHECK-NEXT:    fcmeq v2.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    dup v2.2d, v2.d[0]
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %cc = fcmp oeq double %c1, %c2
   %r = select i1 %cc, <3 x float> %a, <3 x float> %b
diff --git a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
--- a/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/fp16-vector-shuffle.ll
@@ -5,8 +5,7 @@
 define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 {
 ; CHECK-LABEL: select_64:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <4 x half> %a to <4 x i16>
@@ -23,8 +22,7 @@
 define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 {
 ; CHECK-LABEL: select_128:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <8 x half> %a to <8 x i16>
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -480,9 +480,9 @@
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    mov w9, #42
 ; CHECK-NEXT:    cmhi v2.2d, v1.2d, v0.2d
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    dup v0.2d, x9
-; CHECK-NEXT:    add v0.2d, v2.2d, v0.2d
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    dup v1.2d, x9
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %c = icmp ult <2 x i64> %x, <i64 -43, i64 -43>
   %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> <i64 -43, i64 -43>
@@ -653,8 +653,8 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mvn v2.16b, v1.16b
 ; CHECK-NEXT:    cmhi v3.2d, v2.2d, v0.2d
-; CHECK-NEXT:    bsl v3.16b, v0.16b, v2.16b
-; CHECK-NEXT:    add v0.2d, v3.2d, v1.2d
+; CHECK-NEXT:    bif v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %noty = xor <2 x i64> %y, <i64 -1, i64 -1>
   %c = icmp ult <2 x i64> %x, %noty
diff --git a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
--- a/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -71,10 +71,9 @@
 ; CHECK-NEXT:    fmul v2.2s, v1.2s, v1.2s
 ; CHECK-NEXT:    frsqrts v2.2s, v0.2s, v2.2s
 ; CHECK-NEXT:    fmul v2.2s, v2.2s, v0.2s
-; CHECK-NEXT:    fmul v2.2s, v1.2s, v2.2s
-; CHECK-NEXT:    fcmeq v1.2s, v0.2s, #0.0
-; CHECK-NEXT:    bsl v1.8b, v0.8b, v2.8b
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    fmul v1.2s, v1.2s, v2.2s
+; CHECK-NEXT:    fcmeq v2.2s, v0.2s, #0.0
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %1 = tail call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> %a)
   ret <2 x float> %1
@@ -95,10 +94,9 @@
 ; CHECK-NEXT:    fmul v2.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    frsqrts v2.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    fmul v2.4s, v2.4s, v0.4s
-; CHECK-NEXT:    fmul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    fcmeq v1.4s, v0.4s, #0.0
-; CHECK-NEXT:    bsl v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    fcmeq v2.4s, v0.4s, #0.0
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %1 = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
   ret <4 x float> %1
@@ -120,21 +118,19 @@
 ; CHECK-NEXT:    fmul v3.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    frsqrts v3.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    fmul v3.4s, v3.4s, v0.4s
-; CHECK-NEXT:    fmul v3.4s, v2.4s, v3.4s
-; CHECK-NEXT:    fcmeq v2.4s, v0.4s, #0.0
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v3.16b
-; CHECK-NEXT:    frsqrte v0.4s, v1.4s
-; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fcmeq v3.4s, v0.4s, #0.0
+; CHECK-NEXT:    bif v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    frsqrte v2.4s, v1.4s
+; CHECK-NEXT:    fmul v3.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    frsqrts v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    fmul v3.4s, v0.4s, v0.4s
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v3.4s
+; CHECK-NEXT:    fmul v3.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    frsqrts v3.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    fmul v3.4s, v3.4s, v1.4s
-; CHECK-NEXT:    fmul v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    fmul v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    fcmeq v3.4s, v1.4s, #0.0
-; CHECK-NEXT:    bsl v3.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    ret
   %1 = tail call fast <8 x float> @llvm.sqrt.v8f32(<8 x float> %a)
   ret <8 x float> %1
@@ -210,10 +206,9 @@
 ; CHECK-NEXT:    fmul v2.2d, v1.2d, v1.2d
 ; CHECK-NEXT:    frsqrts v2.2d, v0.2d, v2.2d
 ; CHECK-NEXT:    fmul v2.2d, v2.2d, v0.2d
-; CHECK-NEXT:    fmul v2.2d, v1.2d, v2.2d
-; CHECK-NEXT:    fcmeq v1.2d, v0.2d, #0.0
-; CHECK-NEXT:    bsl v1.16b, v0.16b, v2.16b
-; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    fmul v1.2d, v1.2d, v2.2d
+; CHECK-NEXT:    fcmeq v2.2d, v0.2d, #0.0
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %1 = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
   ret <2 x double> %1
@@ -238,24 +233,22 @@
 ; CHECK-NEXT:    fmul v3.2d, v2.2d, v2.2d
 ; CHECK-NEXT:    frsqrts v3.2d, v0.2d, v3.2d
 ; CHECK-NEXT:    fmul v3.2d, v3.2d, v0.2d
-; CHECK-NEXT:    fmul v3.2d, v2.2d, v3.2d
-; CHECK-NEXT:    fcmeq v2.2d, v0.2d, #0.0
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v3.16b
-; CHECK-NEXT:    frsqrte v0.2d, v1.2d
-; CHECK-NEXT:    fmul v3.2d, v0.2d, v0.2d
+; CHECK-NEXT:    fmul v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fcmeq v3.2d, v0.2d, #0.0
+; CHECK-NEXT:    bif v0.16b, v2.16b, v3.16b
+; CHECK-NEXT:    frsqrte v2.2d, v1.2d
+; CHECK-NEXT:    fmul v3.2d, v2.2d, v2.2d
 ; CHECK-NEXT:    frsqrts v3.2d, v1.2d, v3.2d
-; CHECK-NEXT:    fmul v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    fmul v3.2d, v0.2d, v0.2d
+; CHECK-NEXT:    fmul v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fmul v3.2d, v2.2d, v2.2d
 ; CHECK-NEXT:    frsqrts v3.2d, v1.2d, v3.2d
-; CHECK-NEXT:    fmul v0.2d, v0.2d, v3.2d
-; CHECK-NEXT:    fmul v3.2d, v0.2d, v0.2d
+; CHECK-NEXT:    fmul v2.2d, v2.2d, v3.2d
+; CHECK-NEXT:    fmul v3.2d, v2.2d, v2.2d
 ; CHECK-NEXT:    frsqrts v3.2d, v1.2d, v3.2d
 ; CHECK-NEXT:    fmul v3.2d, v3.2d, v1.2d
-; CHECK-NEXT:    fmul v0.2d, v0.2d, v3.2d
+; CHECK-NEXT:    fmul v2.2d, v2.2d, v3.2d
 ; CHECK-NEXT:    fcmeq v3.2d, v1.2d, #0.0
-; CHECK-NEXT:    bsl v3.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    mov v1.16b, v3.16b
+; CHECK-NEXT:    bif v1.16b, v2.16b, v3.16b
 ; CHECK-NEXT:    ret
   %1 = tail call fast <4 x double> @llvm.sqrt.v4f64(<4 x double> %a)
   ret <4 x double> %1
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
@@ -62,8 +62,7 @@
 ; CHECK-LABEL: out_constant_varx_42:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %mx = and <4 x i32> %mask, %x
@@ -76,8 +75,7 @@
 ; CHECK-LABEL: in_constant_varx_42:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
   %n1 = and <4 x i32> %n0, %mask
@@ -90,8 +88,7 @@
 ; CHECK-LABEL: out_constant_varx_42_invmask:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    bsl v2.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %mx = and <4 x i32> %notmask, %x
@@ -105,8 +102,7 @@
 ; CHECK-LABEL: in_constant_varx_42_invmask:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    bsl v2.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
@@ -169,9 +165,8 @@
 define <4 x i32> @out_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: out_constant_42_vary:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #42
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    movi v0.4s, #42
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %mx = and <4 x i32> %mask, <i32 42, i32 42, i32 42, i32 42>
@@ -183,9 +178,8 @@
 define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: in_constant_42_vary:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #42
-; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    movi v0.4s, #42
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
   %n1 = and <4 x i32> %n0, %mask
@@ -197,9 +191,8 @@
 define <4 x i32> @out_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: out_constant_42_vary_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #42
-; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    movi v0.4s, #42
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %mx = and <4 x i32> %notmask, <i32 42, i32 42, i32 42, i32 42>
@@ -212,9 +205,8 @@
 define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
 ; CHECK-LABEL: in_constant_42_vary_invmask:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov v0.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #42
-; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    movi v0.4s, #42
+; CHECK-NEXT:    bit v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
   %n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
diff --git a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
--- a/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
@@ -13,8 +13,7 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <1 x i8> %x, %mask
   %notmask = xor <1 x i8> %mask, <i8 -1>
@@ -46,8 +45,7 @@
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <1 x i16> %x, %mask
   %notmask = xor <1 x i16> %mask, <i16 -1>
@@ -111,8 +109,7 @@
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -128,8 +125,7 @@
 define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <8 x i8> %x, %mask
   %notmask = xor <8 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -141,8 +137,7 @@
 define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <4 x i16> %x, %mask
   %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1>
@@ -154,8 +149,7 @@
 define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v4i16_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <4 x i16> %x, %mask
   %notmask = xor <4 x i16> %mask, <i16 -1, i16 -1, i16 undef, i16 -1>
@@ -167,8 +161,7 @@
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <2 x i32> %x, %mask
   %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
@@ -180,8 +173,7 @@
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -197,8 +189,7 @@
 define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %mx = and <16 x i8> %x, %mask
   %notmask = xor <16 x i8> %mask, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
@@ -210,8 +201,7 @@
 define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %mx = and <8 x i16> %x, %mask
   %notmask = xor <8 x i16> %mask, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
@@ -223,8 +213,7 @@
 define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -236,8 +225,7 @@
 define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
@@ -249,8 +237,7 @@
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %mx = and <2 x i64> %x, %mask
   %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
@@ -270,8 +257,7 @@
 define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v1i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i8> %x, %y
   %n1 = and <1 x i8> %n0, %mask
@@ -286,8 +272,7 @@
 define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i8> %x, %y
   %n1 = and <2 x i8> %n0, %mask
@@ -298,8 +283,7 @@
 define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v1i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i16> %x, %y
   %n1 = and <1 x i16> %n0, %mask
@@ -314,8 +298,7 @@
 define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i8> %x, %y
   %n1 = and <4 x i8> %n0, %mask
@@ -326,8 +309,7 @@
 define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i16> %x, %y
   %n1 = and <2 x i16> %n0, %mask
@@ -338,8 +320,7 @@
 define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v1i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i32> %x, %y
   %n1 = and <1 x i32> %n0, %mask
@@ -354,8 +335,7 @@
 define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v8i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <8 x i8> %x, %y
   %n1 = and <8 x i8> %n0, %mask
@@ -366,8 +346,7 @@
 define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i16> %x, %y
   %n1 = and <4 x i16> %n0, %mask
@@ -378,8 +357,7 @@
 define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i32> %x, %y
   %n1 = and <2 x i32> %n0, %mask
@@ -390,8 +368,7 @@
 define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: in_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    ret
   %n0 = xor <1 x i64> %x, %y
   %n1 = and <1 x i64> %n0, %mask
@@ -406,8 +383,7 @@
 define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v16i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <16 x i8> %x, %y
   %n1 = and <16 x i8> %n0, %mask
@@ -418,8 +394,7 @@
 define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
 ; CHECK-LABEL: in_v8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <8 x i16> %x, %y
   %n1 = and <8 x i16> %n0, %mask
@@ -430,8 +405,7 @@
 define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: in_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <4 x i32> %x, %y
   %n1 = and <4 x i32> %n0, %mask
@@ -442,8 +416,7 @@
 define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: in_v2i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-NEXT:    ret
   %n0 = xor <2 x i64> %x, %y
   %n1 = and <2 x i64> %n0, %mask