diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -88,6 +88,7 @@
   GREVIW,
   GORCI,
   GORCIW,
+  SHFLI,
   // Vector Extension
   // VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand
   // for the VL value to be used for the operation.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2353,6 +2353,20 @@
     Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
     break;
   }
+  case RISCVISD::SHFLI: {
+    // There is no SHFLIW instruction, but we can just promote the operation.
+    assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+           "Unexpected custom legalisation");
+    SDLoc DL(N);
+    SDValue NewOp0 =
+        DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
+    SDValue NewRes =
+        DAG.getNode(RISCVISD::SHFLI, DL, MVT::i64, NewOp0, N->getOperand(1));
+    // ReplaceNodeResults requires we maintain the same type for the return
+    // value.
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewRes));
+    break;
+  }
   case ISD::BSWAP:
   case ISD::BITREVERSE: {
     assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
@@ -2487,7 +2501,8 @@
 // [8]  = 0x00FF00FF / 0xFF00FF00
 // [16] = 0x0000FFFF / 0xFFFFFFFF
 // [32] = 0x00000000FFFFFFFF / 0xFFFFFFFF00000000 (for RV64)
-static Optional<RISCVBitmanipPat> matchRISCVBitmanipPat(SDValue Op) {
+static Optional<RISCVBitmanipPat>
+matchRISCVBitmanipPat(SDValue Op, ArrayRef<uint64_t> BitmanipMasks) {
   Optional<uint64_t> Mask;
   // Optionally consume a mask around the shift operation.
   if (Op.getOpcode() == ISD::AND && isa<ConstantSDNode>(Op.getOperand(1))) {
@@ -2500,26 +2515,17 @@
   if (!isa<ConstantSDNode>(Op.getOperand(1)))
     return None;
-  auto ShAmt = Op.getConstantOperandVal(1);
+  uint64_t ShAmt = Op.getConstantOperandVal(1);
 
-  if (!isPowerOf2_64(ShAmt))
+  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+  if (ShAmt >= Width || !isPowerOf2_64(ShAmt))
     return None;
-
-  // These are the unshifted masks which we use to match bit-manipulation
-  // patterns. They may be shifted left in certain circumstances.
-  static const uint64_t BitmanipMasks[] = {
-      0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
-      0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
-  };
-
-  unsigned MaskIdx = Log2_64(ShAmt);
-  if (MaskIdx >= array_lengthof(BitmanipMasks))
+  // If we don't have enough masks for 64 bit, then we must be trying to
+  // match SHFL, so we're only allowed to shift 1/4 of the width.
+  if (BitmanipMasks.size() == 5 && ShAmt >= (Width / 2))
     return None;
-  auto Src = Op.getOperand(0);
-
-  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
-  auto ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+  SDValue Src = Op.getOperand(0);
 
   // The expected mask is shifted left when the AND is found around SHL
   // patterns.
@@ -2546,6 +2552,9 @@
     }
   }
 
+  unsigned MaskIdx = Log2_32(ShAmt);
+  uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
   if (SHLExpMask)
     ExpMask <<= ShAmt;
 
@@ -2555,15 +2564,27 @@
   return RISCVBitmanipPat{Src, (unsigned)ShAmt, IsSHL};
 }
 
+static Optional<RISCVBitmanipPat> matchGREVIPat(SDValue Op) {
+  // These are the unshifted masks which we use to match bit-manipulation
+  // patterns. They may be shifted left in certain circumstances.
+  static const uint64_t BitmanipMasks[] = {
+      0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+      0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL,
+  };
+
+  return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
 // Match the following pattern as a GREVI(W) operation
 // (or (BITMANIP_SHL x), (BITMANIP_SRL x))
 static SDValue combineORToGREV(SDValue Op, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
   EVT VT = Op.getValueType();
   if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
-    auto LHS = matchRISCVBitmanipPat(Op.getOperand(0));
-    auto RHS = matchRISCVBitmanipPat(Op.getOperand(1));
+    auto LHS = matchGREVIPat(Op.getOperand(0));
+    auto RHS = matchGREVIPat(Op.getOperand(1));
     if (LHS && RHS && LHS->formsPairWith(*RHS)) {
       SDLoc DL(Op);
       return DAG.getNode(
@@ -2585,6 +2606,7 @@
 // 4. (or (rotl/rotr x, bitwidth/2), x)
 static SDValue combineORToGORC(SDValue Op, SelectionDAG &DAG,
                                const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
   EVT VT = Op.getValueType();
   if (VT == Subtarget.getXLenVT() || (Subtarget.is64Bit() && VT == MVT::i32)) {
@@ -2623,14 +2645,14 @@
       return SDValue();
     SDValue OrOp0 = Op0.getOperand(0);
     SDValue OrOp1 = Op0.getOperand(1);
-    auto LHS = matchRISCVBitmanipPat(OrOp0);
+    auto LHS = matchGREVIPat(OrOp0);
     // OR is commutable so swap the operands and try again: x might have been
     // on the left
     if (!LHS) {
       std::swap(OrOp0, OrOp1);
-      LHS = matchRISCVBitmanipPat(OrOp0);
+      LHS = matchGREVIPat(OrOp0);
     }
-    auto RHS = matchRISCVBitmanipPat(Op1);
+    auto RHS = matchGREVIPat(Op1);
     if (LHS && RHS && LHS->formsPairWith(*RHS) && LHS->Op == OrOp1) {
       return DAG.getNode(
           RISCVISD::GORCI, DL, VT, LHS->Op,
@@ -2640,6 +2662,93 @@
   return SDValue();
 }
+
+static Optional<RISCVBitmanipPat> matchSHFLPat(SDValue Op) {
+  // These are the unshifted masks which we use to match bit-manipulation
+  // patterns. They may be shifted left in certain circumstances.
+  static const uint64_t BitmanipMasks[] = {
+      0x2222222222222222ULL, 0x0C0C0C0C0C0C0C0CULL, 0x00F000F000F000F0ULL,
+      0x0000FF000000FF00ULL, 0x00000000FFFF0000ULL
+  };
+
+  return matchRISCVBitmanipPat(Op, BitmanipMasks);
+}
+
+// Match (or (or (SHFL_SHL x), (SHFL_SHR x)), (SHFL_AND x))
+static SDValue combineORToSHFL(SDValue Op, SelectionDAG &DAG,
+                               const RISCVSubtarget &Subtarget) {
+  assert(Subtarget.hasStdExtZbp() && "Expected Zbp extension");
+  EVT VT = Op.getValueType();
+
+  if (VT != MVT::i32 && VT != Subtarget.getXLenVT())
+    return SDValue();
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+
+  // OR is commutable so canonicalize the second OR to the LHS.
+  if (Op0.getOpcode() != ISD::OR)
+    std::swap(Op0, Op1);
+  if (Op0.getOpcode() != ISD::OR)
+    return SDValue();
+
+  // We found an inner OR, so our operands are the operands of the inner OR
+  // and the other operand of the outer OR.
+  SDValue A = Op0.getOperand(0);
+  SDValue B = Op0.getOperand(1);
+  SDValue C = Op1;
+
+  auto Match1 = matchSHFLPat(A);
+  auto Match2 = matchSHFLPat(B);
+
+  // If neither matched, we failed.
+  if (!Match1 && !Match2)
+    return SDValue();
+
+  // We had at least one match. If one failed, try the remaining C operand.
+  if (!Match1) {
+    std::swap(A, C);
+    Match1 = matchSHFLPat(A);
+    if (!Match1)
+      return SDValue();
+  } else if (!Match2) {
+    std::swap(B, C);
+    Match2 = matchSHFLPat(B);
+    if (!Match2)
+      return SDValue();
+  }
+  assert(Match1 && Match2);
+
+  // Make sure our matches pair up.
+  if (!Match1->formsPairWith(*Match2))
+    return SDValue();
+
+  // All that remains is to make sure C is an AND with the same input that
+  // masks out the bits that are being shuffled.
+  if (C.getOpcode() != ISD::AND || !isa<ConstantSDNode>(C.getOperand(1)) ||
+      C.getOperand(0) != Match1->Op)
+    return SDValue();
+
+  uint64_t Mask = C.getConstantOperandVal(1);
+
+  static const uint64_t BitmanipMasks[] = {
+      0x9999999999999999ULL, 0xC3C3C3C3C3C3C3C3ULL, 0xF00FF00FF00FF00FULL,
+      0xFF0000FFFF0000FFULL, 0xFFFF00000000FFFFULL,
+  };
+
+  unsigned Width = Op.getValueType() == MVT::i64 ? 64 : 32;
+  unsigned MaskIdx = Log2_32(Match1->ShAmt);
+  uint64_t ExpMask = BitmanipMasks[MaskIdx] & maskTrailingOnes<uint64_t>(Width);
+
+  if (Mask != ExpMask)
+    return SDValue();
+
+  SDLoc DL(Op);
+  return DAG.getNode(
+      RISCVISD::SHFLI, DL, VT, Match1->Op,
+      DAG.getTargetConstant(Match1->ShAmt, DL, Subtarget.getXLenVT()));
+}
+
 // Combine (GREVI (GREVI x, C2), C1) -> (GREVI x, C1^C2) when C1^C2 is
 // non-zero, and to x when it is. Any repeated GREVI stage undoes itself.
 // Combine (GORCI (GORCI x, C2), C1) -> (GORCI x, C1|C2). Repeated stage does
@@ -2819,6 +2928,8 @@
       return GREV;
     if (auto GORC = combineORToGORC(SDValue(N, 0), DCI.DAG, Subtarget))
       return GORC;
+    if (auto SHFL = combineORToSHFL(SDValue(N, 0), DCI.DAG, Subtarget))
+      return SHFL;
     break;
   case RISCVISD::SELECT_CC: {
     // Transform
@@ -3042,6 +3153,19 @@
     // more precise answer could be calculated for SRAW depending on known
     // bits in the shift amount.
     return 33;
+  case RISCVISD::SHFLI: {
+    // There is no SHFLIW, but an i64 SHFLI with bit 4 of the control word
+    // cleared doesn't affect bit 31. The upper 32 bits will be shuffled, but
+    // will stay within the upper 32 bits. If there were more than 32 sign bits
+    // before, there will be at least 33 sign bits after.
+    if (Op.getValueType() == MVT::i64 &&
+        (Op.getConstantOperandVal(1) & 0x10) == 0) {
+      unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+      if (Tmp > 32)
+        return 33;
+    }
+    break;
+  }
   case RISCVISD::VMV_X_S:
     // The number of sign bits of the scalar result is computed by obtaining the
     // element type of the input vector operand, subtracting its width from the
@@ -4705,6 +4829,7 @@
   NODE_NAME_CASE(GREVIW)
   NODE_NAME_CASE(GORCI)
   NODE_NAME_CASE(GORCIW)
+  NODE_NAME_CASE(SHFLI)
   NODE_NAME_CASE(VMV_V_X_VL)
   NODE_NAME_CASE(VFMV_V_F_VL)
   NODE_NAME_CASE(VMV_X_S)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -671,8 +671,10 @@
 def riscv_greviw : SDNode<"RISCVISD::GREVIW", SDTIntBinOp, []>;
 def riscv_gorci : SDNode<"RISCVISD::GORCI", SDTIntBinOp, []>;
 def riscv_gorciw : SDNode<"RISCVISD::GORCIW", SDTIntBinOp, []>;
+def riscv_shfli : SDNode<"RISCVISD::SHFLI", SDTIntBinOp, []>;
 
 let Predicates = [HasStdExtZbp] in {
+def : Pat<(riscv_shfli GPR:$rs1, timm:$shamt), (SHFLI GPR:$rs1, timm:$shamt)>;
 def : Pat<(riscv_grevi GPR:$rs1, timm:$shamt), (GREVI GPR:$rs1, timm:$shamt)>;
 def : Pat<(riscv_gorci GPR:$rs1, timm:$shamt), (GORCI GPR:$rs1, timm:$shamt)>;
@@ -788,48 +790,6 @@
 def : Pat<(and GPR:$rs, 0x000000000000FFFF), (ZEXTH_RV64 GPR:$rs)>;
 }
 
-let Predicates = [HasStdExtZbp, IsRV32] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 8)), (i32 0x00FF0000)),
-                  (and GPR:$rs1, (i32 0xFF0000FF))),
-              (and (srl GPR:$rs1, (i32 8)), (i32 0x0000FF00))),
-          (SHFLI GPR:$rs1, (i32 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 4)), (i32 0x0F000F00)),
-                  (and GPR:$rs1, (i32 0xF00FF00F))),
-              (and (srl GPR:$rs1, (i32 4)), (i32 0x00F000F0))),
-          (SHFLI GPR:$rs1, (i32 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 2)), (i32 0x30303030)),
-                  (and GPR:$rs1, (i32 0xC3C3C3C3))),
-              (and (srl GPR:$rs1, (i32 2)), (i32 0x0C0C0C0C))),
-          (SHFLI GPR:$rs1, (i32 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i32 1)), (i32 0x44444444)),
-                  (and GPR:$rs1, (i32 0x99999999))),
-              (and (srl GPR:$rs1, (i32 1)), (i32 0x22222222))),
-          (SHFLI GPR:$rs1, (i32 1))>;
-} // Predicates = [HasStdExtZbp, IsRV32]
-
-let Predicates = [HasStdExtZbp, IsRV64] in {
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 16)), (i64 0x0000FFFF00000000)),
-                  (and GPR:$rs1, (i64 0xFFFF00000000FFFF))),
-              (and (srl GPR:$rs1, (i64 16)), (i64 0x00000000FFFF0000))),
-          (SHFLI GPR:$rs1, (i64 16))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 8)), (i64 0x00FF000000FF0000)),
-                  (and GPR:$rs1, (i64 0xFF0000FFFF0000FF))),
-              (and (srl GPR:$rs1, (i64 8)), (i64 0x0000FF000000FF00))),
-          (SHFLI GPR:$rs1, (i64 8))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 4)), (i64 0x0F000F000F000F00)),
-                  (and GPR:$rs1, (i64 0xF00FF00FF00FF00F))),
-              (and (srl GPR:$rs1, (i64 4)), (i64 0x00F000F000F000F0))),
-          (SHFLI GPR:$rs1, (i64 4))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 2)), (i64 0x3030303030303030)),
-                  (and GPR:$rs1, (i64 0xC3C3C3C3C3C3C3C3))),
-              (and (srl GPR:$rs1, (i64 2)), (i64 0x0C0C0C0C0C0C0C0C))),
-          (SHFLI GPR:$rs1, (i64 2))>;
-def : Pat<(or (or (and (shl GPR:$rs1, (i64 1)), (i64 0x4444444444444444)),
-                  (and GPR:$rs1, (i64 0x9999999999999999))),
-              (and (srl GPR:$rs1, (i64 1)), (i64 0x2222222222222222))),
-          (SHFLI GPR:$rs1, (i64 1))>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-
 let Predicates = [HasStdExtZba] in {
 def : Pat<(add (shl GPR:$rs1, (XLenVT 1)), GPR:$rs2),
           (SH1ADD GPR:$rs1, GPR:$rs2)>;
diff --git
a/llvm/test/CodeGen/RISCV/rv32Zbp.ll b/llvm/test/CodeGen/RISCV/rv32Zbp.ll --- a/llvm/test/CodeGen/RISCV/rv32Zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv32Zbp.ll @@ -2850,8 +2850,8 @@ ; RV32I-NEXT: addi a3, a3, 1092 ; RV32I-NEXT: and a5, a5, a3 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: lui a4, 139810 @@ -2876,7 +2876,7 @@ %and = and i64 %a, -7378697629483820647 %shl = shl i64 %a, 1 %and1 = and i64 %shl, 4919131752989213764 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 1 %and2 = and i64 %shr, 2459565876494606882 %or3 = or i64 %or, %and2 @@ -2898,7 +2898,7 @@ ; RV32I-NEXT: lui a2, 49345 ; RV32I-NEXT: addi a2, a2, -1012 ; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl2_i32: @@ -2916,7 +2916,7 @@ %or = or i32 %and1, %and %shr = lshr i32 %a, 2 %and2 = and i32 %shr, 202116108 - %or3 = or i32 %or, %and2 + %or3 = or i32 %and2, %or ret i32 %or3 } @@ -2933,16 +2933,16 @@ ; RV32I-NEXT: addi a3, a3, 48 ; RV32I-NEXT: and a5, a5, a3 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: or a3, a5, a6 +; RV32I-NEXT: or a2, a2, a3 +; RV32I-NEXT: or a3, a6, a5 ; RV32I-NEXT: srli a0, a0, 2 ; RV32I-NEXT: srli a1, a1, 2 ; RV32I-NEXT: lui a4, 49345 ; RV32I-NEXT: addi a4, a4, -1012 ; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl2_i64: @@ -2959,10 +2959,10 @@ %and = and i64 %a, -4340410370284600381 %shl = shl i64 %a, 2 %and1 = and i64 %shl, 3472328296227680304 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 2 %and2 = and i64 %shr, 868082074056920076 - %or3 = or i64 %or, %and2 + %or3 = or i64 %and2, %or ret i64 %or3 } @@ -2976,12 +2976,12 @@ ; RV32I-NEXT: lui a3, 61441 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: srli a0, a0, 4 -; RV32I-NEXT: lui a2, 3840 -; RV32I-NEXT: addi a2, a2, 240 -; RV32I-NEXT: and a0, a0, a2 -; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: lui a3, 3840 +; RV32I-NEXT: addi a3, a3, 240 +; RV32I-NEXT: and a0, a0, a3 +; RV32I-NEXT: or a0, a0, a1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl4_i32: @@ -2996,10 +2996,10 @@ %and = and i32 %a, -267390961 %shl = shl i32 %a, 4 %and1 = and i32 %shl, 251662080 - %or = or i32 %and1, %and %shr = lshr i32 %a, 4 %and2 = and i32 %shr, 15728880 - %or3 = or i32 %or, %and2 + %or = or i32 %and2, %and + %or3 = or i32 %or, %and1 ret i32 %or3 } @@ -3008,24 +3008,24 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 983295 ; RV32I-NEXT: addi a2, a2, 15 -; RV32I-NEXT: and a6, a0, a2 -; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: and a6, a1, a2 +; RV32I-NEXT: and a2, a0, a2 ; RV32I-NEXT: slli a4, a1, 4 ; RV32I-NEXT: slli a5, a0, 4 ; RV32I-NEXT: lui a3, 61441 ; RV32I-NEXT: addi a3, a3, -256 ; RV32I-NEXT: and a5, a5, a3 ; RV32I-NEXT: and a3, a4, a3 -; RV32I-NEXT: or a2, a3, a2 -; RV32I-NEXT: or a3, a5, a6 -; RV32I-NEXT: srli a0, a0, 4 ; RV32I-NEXT: srli a1, a1, 4 +; RV32I-NEXT: srli a0, a0, 4 ; RV32I-NEXT: lui a4, 3840 ; RV32I-NEXT: addi a4, a4, 240 -; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: or a1, a3, a1 
+; RV32I-NEXT: or a0, a5, a0 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl4_i64: @@ -3042,10 +3042,10 @@ %and = and i64 %a, -1148435428713435121 %shl = shl i64 %a, 4 %and1 = and i64 %shl, 1080880403494997760 - %or = or i64 %and1, %and %shr = lshr i64 %a, 4 %and2 = and i64 %shr, 67555025218437360 - %or3 = or i64 %or, %and2 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and ret i64 %or3 } @@ -3058,12 +3058,12 @@ ; RV32I-NEXT: slli a2, a0, 8 ; RV32I-NEXT: lui a3, 4080 ; RV32I-NEXT: and a2, a2, a3 -; RV32I-NEXT: or a1, a2, a1 ; RV32I-NEXT: srli a0, a0, 8 -; RV32I-NEXT: lui a2, 16 -; RV32I-NEXT: addi a2, a2, -256 -; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: lui a3, 16 +; RV32I-NEXT: addi a3, a3, -256 +; RV32I-NEXT: and a0, a0, a3 ; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl8_i32: @@ -3078,10 +3078,10 @@ %and = and i32 %a, -16776961 %shl = shl i32 %a, 8 %and1 = and i32 %shl, 16711680 - %or = or i32 %and1, %and %shr = lshr i32 %a, 8 %and2 = and i32 %shr, 65280 - %or3 = or i32 %or, %and2 + %or = or i32 %and, %and2 + %or3 = or i32 %or, %and1 ret i32 %or3 } @@ -3090,23 +3090,23 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 1044480 ; RV32I-NEXT: addi a2, a2, 255 -; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a6, a0, a2 ; RV32I-NEXT: and a2, a1, a2 -; RV32I-NEXT: slli a4, a1, 8 -; RV32I-NEXT: slli a5, a0, 8 -; RV32I-NEXT: lui a6, 4080 -; RV32I-NEXT: and a5, a5, a6 -; RV32I-NEXT: and a4, a4, a6 -; RV32I-NEXT: or a2, a4, a2 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: srli a0, a0, 8 +; RV32I-NEXT: slli a4, a0, 8 +; RV32I-NEXT: slli a5, a1, 8 +; RV32I-NEXT: lui a3, 4080 +; RV32I-NEXT: and a5, a5, a3 +; RV32I-NEXT: and a3, a4, a3 ; RV32I-NEXT: srli a1, a1, 8 +; RV32I-NEXT: srli a0, a0, 8 ; RV32I-NEXT: lui a4, 16 ; RV32I-NEXT: addi a4, a4, -256 -; RV32I-NEXT: and a1, a1, a4 ; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: and a1, a1, a4 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a1, a5, a1 ; RV32I-NEXT: ret ; ; RV32IB-LABEL: shfl8_i64: @@ -3123,10 +3123,10 @@ %and = and i64 %a, -72056494543077121 %shl = shl i64 %a, 8 %and1 = and i64 %shl, 71776119077928960 - %or = or i64 %and1, %and %shr = lshr i64 %a, 8 %and2 = and i64 %shr, 280375465148160 - %or3 = or i64 %or, %and2 + %or = or i64 %and2, %and + %or3 = or i64 %and1, %or ret i64 %or3 } diff --git a/llvm/test/CodeGen/RISCV/rv64Zbp.ll b/llvm/test/CodeGen/RISCV/rv64Zbp.ll --- a/llvm/test/CodeGen/RISCV/rv64Zbp.ll +++ b/llvm/test/CodeGen/RISCV/rv64Zbp.ll @@ -3430,8 +3430,42 @@ ret i64 %2 } -; There's no [un]shfliw instruction as slliu.w occupies the encoding slot that -; would be occupied by shfliw. 
+define signext i32 @shfl1_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl1_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 629146 +; RV64I-NEXT: addiw a1, a1, -1639 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 1 +; RV64I-NEXT: lui a3, 279620 +; RV64I-NEXT: addiw a3, a3, 1092 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 1 +; RV64I-NEXT: lui a2, 139810 +; RV64I-NEXT: addiw a2, a2, 546 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl1_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip.n a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl1_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip.n a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -1717986919 + %shl = shl i32 %a, 1 + %and1 = and i32 %shl, 1145324612 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 1 + %and2 = and i32 %shr, 572662306 + %or3 = or i32 %or, %and2 + ret i32 %or3 +} define i64 @shfl1_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl1_i64: @@ -3455,7 +3489,7 @@ ; RV64I-NEXT: slli a4, a3, 14 ; RV64I-NEXT: addi a4, a4, 1092 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 1 ; RV64I-NEXT: slli a2, a3, 13 ; RV64I-NEXT: addi a2, a2, 546 @@ -3475,13 +3509,50 @@ %and = and i64 %a, -7378697629483820647 %shl = shl i64 %a, 1 %and1 = and i64 %shl, 4919131752989213764 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 1 %and2 = and i64 %shr, 2459565876494606882 %or3 = or i64 %or, %and2 ret i64 %or3 } +define signext i32 @shfl2_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl2_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 801852 +; RV64I-NEXT: addiw a1, a1, 963 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 2 +; RV64I-NEXT: lui a3, 197379 +; RV64I-NEXT: addiw a3, a3, 48 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: srli a0, a0, 2 +; RV64I-NEXT: lui a2, 49345 +; RV64I-NEXT: addiw a2, a2, -1012 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl2_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip2.b a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl2_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip2.b a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -1010580541 + %shl = shl i32 %a, 2 + %and1 = and i32 %shl, 808464432 + %or = or i32 %and1, %and + %shr = lshr i32 %a, 2 + %and2 = and i32 %shr, 202116108 + %or3 = or i32 %and2, %or + ret i32 %or3 +} + define i64 @shfl2_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl2_i64: ; RV64I: # %bb.0: @@ -3504,14 +3575,14 @@ ; RV64I-NEXT: slli a4, a4, 12 ; RV64I-NEXT: addi a4, a4, 48 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: srli a0, a0, 2 ; RV64I-NEXT: slli a2, a3, 14 ; RV64I-NEXT: addi a2, a2, 193 ; RV64I-NEXT: slli a2, a2, 12 ; RV64I-NEXT: addi a2, a2, -1012 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: shfl2_i64: @@ -3526,13 +3597,50 @@ %and = and i64 %a, -4340410370284600381 %shl = shl i64 %a, 2 %and1 = and i64 %shl, 3472328296227680304 - %or = or i64 %and1, %and + %or = or i64 %and, %and1 %shr = lshr i64 %a, 2 %and2 = and i64 %shr, 868082074056920076 - %or3 = or i64 %or, %and2 + %or3 = or i64 %and2, %or ret i64 %or3 } +define signext i32 @shfl4_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl4_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui 
a1, 983295 +; RV64I-NEXT: addiw a1, a1, 15 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 4 +; RV64I-NEXT: lui a3, 61441 +; RV64I-NEXT: addiw a3, a3, -256 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a0, a0, 4 +; RV64I-NEXT: lui a3, 3840 +; RV64I-NEXT: addiw a3, a3, 240 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl4_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip4.h a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl4_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip4.h a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -267390961 + %shl = shl i32 %a, 4 + %and1 = and i32 %shl, 251662080 + %shr = lshr i32 %a, 4 + %and2 = and i32 %shr, 15728880 + %or = or i32 %and2, %and + %or3 = or i32 %or, %and1 + ret i32 %or3 +} + define i64 @shfl4_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl4_i64: ; RV64I: # %bb.0: @@ -3555,12 +3663,12 @@ ; RV64I-NEXT: slli a4, a4, 12 ; RV64I-NEXT: addi a4, a4, -256 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 4 -; RV64I-NEXT: slli a2, a3, 20 -; RV64I-NEXT: addi a2, a2, 240 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slli a3, a3, 20 +; RV64I-NEXT: addi a3, a3, 240 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: shfl4_i64: @@ -3575,13 +3683,49 @@ %and = and i64 %a, -1148435428713435121 %shl = shl i64 %a, 4 %and1 = and i64 %shl, 1080880403494997760 - %or = or i64 %and1, %and %shr = lshr i64 %a, 4 %and2 = and i64 %shr, 67555025218437360 - %or3 = or i64 %or, %and2 + %or = or i64 %and1, %and2 + %or3 = or i64 %or, %and ret i64 %or3 } +define signext i32 @shfl8_i32(i32 signext %a, i32 signext %b) nounwind { +; RV64I-LABEL: shfl8_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a1, 1044480 +; RV64I-NEXT: addiw a1, a1, 255 +; RV64I-NEXT: and a1, a0, a1 +; RV64I-NEXT: slli a2, a0, 8 +; RV64I-NEXT: lui a3, 4080 +; RV64I-NEXT: and a2, a2, a3 +; RV64I-NEXT: srli a0, a0, 8 +; RV64I-NEXT: lui a3, 16 +; RV64I-NEXT: addiw a3, a3, -256 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: ret +; +; RV64IB-LABEL: shfl8_i32: +; RV64IB: # %bb.0: +; RV64IB-NEXT: zip8.w a0, a0 +; RV64IB-NEXT: ret +; +; RV64IBP-LABEL: shfl8_i32: +; RV64IBP: # %bb.0: +; RV64IBP-NEXT: zip8.w a0, a0 +; RV64IBP-NEXT: ret + %and = and i32 %a, -16776961 + %shl = shl i32 %a, 8 + %and1 = and i32 %shl, 16711680 + %shr = lshr i32 %a, 8 + %and2 = and i32 %shr, 65280 + %or = or i32 %and, %and2 + %or3 = or i32 %or, %and1 + ret i32 %or3 +} + define i64 @shfl8_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: shfl8_i64: ; RV64I: # %bb.0: @@ -3598,14 +3742,14 @@ ; RV64I-NEXT: addi a4, a4, 255 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: and a2, a2, a4 -; RV64I-NEXT: or a1, a2, a1 ; RV64I-NEXT: srli a0, a0, 8 -; RV64I-NEXT: slli a2, a3, 24 -; RV64I-NEXT: addi a2, a2, 1 -; RV64I-NEXT: slli a2, a2, 16 -; RV64I-NEXT: addi a2, a2, -256 -; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: addi a3, a3, 1 +; RV64I-NEXT: slli a3, a3, 16 +; RV64I-NEXT: addi a3, a3, -256 +; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: or a0, a2, a0 ; RV64I-NEXT: ret ; ; RV64IB-LABEL: shfl8_i64: @@ -3620,10 +3764,10 @@ %and = and i64 %a, -72056494543077121 %shl = shl i64 %a, 8 %and1 = and i64 %shl, 71776119077928960 - %or = or i64 %and1, %and %shr = lshr i64 %a, 8 %and2 = and i64 %shr, 
280375465148160 - %or3 = or i64 %or, %and2 + %or = or i64 %and2, %and + %or3 = or i64 %and1, %or ret i64 %or3 }
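
Reviewer note (not part of the patch): the listing below is a standalone C++ sketch of the bit pattern combineORToSHFL recognizes for a 32-bit value, i.e. (x & keep) | ((x << N) & shlMask) | ((x >> N) & srlMask), where srlMask comes from the matchSHFLPat table (0x22222222, 0x0C0C0C0C, 0x00F000F0, 0x0000FF00), shlMask = srlMask << N, and keep = ~(shlMask | srlMask) is the AND mask table checked in combineORToSHFL. The helper names (shflStage32, SrlMasks) are invented for the example; only the mask constants come from the patch and the tests above.

// Standalone sketch, not LLVM code: one SHFL stage over a 32-bit value.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Per-stage SRL masks for N = 1, 2, 4, 8 (same constants as matchSHFLPat,
// truncated to 32 bits).
static const uint32_t SrlMasks[] = {0x22222222u, 0x0C0C0C0Cu, 0x00F000F0u,
                                    0x0000FF00u};

// Swap the two middle N-bit blocks of every 4N-bit group; N must be 1, 2, 4
// or 8.
static uint32_t shflStage32(uint32_t X, unsigned N) {
  unsigned Idx = 0;
  while ((1u << Idx) != N)
    ++Idx;
  uint32_t SrlMask = SrlMasks[Idx];
  uint32_t ShlMask = SrlMask << N;
  uint32_t Keep = ~(ShlMask | SrlMask);
  return (X & Keep) | ((X << N) & ShlMask) | ((X >> N) & SrlMask);
}

int main() {
  // For N = 4 the stationary mask is 0xF00FF00F, exactly the AND constant
  // (-267390961) in the shfl4_i32 tests and entry [2] of the second mask
  // table in combineORToSHFL.
  assert((uint32_t)~((SrlMasks[2] << 4) | SrlMasks[2]) == 0xF00FF00Fu);
  // zip4.h of 0x12345678 swaps the inner nibbles of each half-word,
  // giving 0x13245768.
  printf("shfl4(0x12345678) = 0x%08X\n", shflStage32(0x12345678u, 4));
  return 0;
}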