diff --git a/clang/lib/Basic/Targets/BPF.h b/clang/lib/Basic/Targets/BPF.h
--- a/clang/lib/Basic/Targets/BPF.h
+++ b/clang/lib/Basic/Targets/BPF.h
@@ -106,7 +106,7 @@
   void fillValidCPUList(SmallVectorImpl<StringRef> &Values) const override;
 
   bool setCPU(const std::string &Name) override {
-    if (Name == "v3") {
+    if (Name == "v3" || Name == "v4") {
       HasAlu32 = true;
     }
 
diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp
--- a/clang/lib/Basic/Targets/BPF.cpp
+++ b/clang/lib/Basic/Targets/BPF.cpp
@@ -32,7 +32,7 @@
 }
 
 static constexpr llvm::StringLiteral ValidCPUNames[] = {"generic", "v1", "v2",
-                                                        "v3", "probe"};
+                                                        "v3", "v4", "probe"};
 
 bool BPFTargetInfo::isValidCPUName(StringRef Name) const {
   return llvm::is_contained(ValidCPUNames, Name);
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -73,7 +73,7 @@
 
 // RUN: not %clang_cc1 -triple bpf--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix BPF
 // BPF: error: unknown target CPU 'not-a-cpu'
-// BPF-NEXT: note: valid target CPU values are: generic, v1, v2, v3, probe{{$}}
+// BPF-NEXT: note: valid target CPU values are: generic, v1, v2, v3, v4, probe{{$}}
 
 // RUN: not %clang_cc1 -triple avr--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AVR
 // AVR: error: unknown target CPU 'not-a-cpu'
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -227,6 +227,7 @@
         .Case("if", true)
         .Case("call", true)
         .Case("goto", true)
+        .Case("gotol", true)
         .Case("*", true)
         .Case("exit", true)
         .Case("lock", true)
@@ -241,13 +242,20 @@
         .Case("u32", true)
         .Case("u16", true)
         .Case("u8", true)
+        .Case("s32", true)
+        .Case("s16", true)
+        .Case("s8", true)
         .Case("be64", true)
         .Case("be32", true)
         .Case("be16", true)
         .Case("le64", true)
         .Case("le32", true)
         .Case("le16", true)
+        .Case("bswap16", true)
+        .Case("bswap32", true)
+        .Case("bswap64", true)
         .Case("goto", true)
+        .Case("gotol", true)
         .Case("ll", true)
         .Case("skb", true)
         .Case("s", true)
diff --git a/llvm/lib/Target/BPF/BPF.td b/llvm/lib/Target/BPF/BPF.td
--- a/llvm/lib/Target/BPF/BPF.td
+++ b/llvm/lib/Target/BPF/BPF.td
@@ -30,6 +30,7 @@
 def : Proc<"v1", []>;
 def : Proc<"v2", []>;
 def : Proc<"v3", [ALU32]>;
+def : Proc<"v4", [ALU32]>;
 def : Proc<"probe", []>;
 
 def BPFInstPrinter : AsmWriter {
@@ -45,7 +46,7 @@
   int Variant = 0;
   string Name = "BPF";
   string BreakCharacters = ".";
-  string TokenizingCharacters = "#()[]=:.<>!+*";
+  string TokenizingCharacters = "#()[]=:.<>!+*%/";
 }
 
 def BPF : Target {
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -192,15 +192,17 @@
   default:
     break;
   case ISD::SDIV: {
-    DebugLoc Empty;
-    const DebugLoc &DL = Node->getDebugLoc();
-    if (DL != Empty)
-      errs() << "Error at line " << DL.getLine() << ": ";
-    else
-      errs() << "Error: ";
-    errs() << "Unsupport signed division for DAG: ";
-    Node->print(errs(), CurDAG);
-    errs() << "Please convert to unsigned div/mod.\n";
+    if (!Subtarget->getCPUv4_sdiv()) {
+      DebugLoc Empty;
+      const DebugLoc &DL = Node->getDebugLoc();
+      if (DL != Empty)
+        errs() << "Error at line " << DL.getLine() << ": ";
+      else
+        errs() << "Error: ";
+      errs() << "Unsupport signed division for DAG: ";
+      Node->print(errs(), CurDAG);
+      errs() << "Please convert to unsigned div/mod.\n";
+    }
     break;
   }
   case ISD::INTRINSIC_W_CHAIN: {
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -71,6 +71,7 @@
   bool HasAlu32;
   bool HasJmp32;
   bool HasJmpExt;
+  bool HasCPUv4_movsx;
 
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -102,7 +102,8 @@
 
     setOperationAction(ISD::SDIVREM, VT, Expand);
     setOperationAction(ISD::UDIVREM, VT, Expand);
-    setOperationAction(ISD::SREM, VT, Expand);
+    if (!STI.getCPUv4_sdiv())
+      setOperationAction(ISD::SREM, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
@@ -141,9 +142,11 @@
     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+    if (!STI.getCPUv4_ldsx()) {
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
+    }
   }
 
   setBooleanContents(ZeroOrOneBooleanContent);
@@ -183,6 +186,7 @@
   HasAlu32 = STI.getHasAlu32();
   HasJmp32 = STI.getHasJmp32();
   HasJmpExt = STI.getHasJmpExt();
+  HasCPUv4_movsx = STI.getCPUv4_movsx();
 }
 
 bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
@@ -673,11 +677,20 @@
   Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
   Register PromotedReg1 = RegInfo.createVirtualRegister(RC);
   Register PromotedReg2 = RegInfo.createVirtualRegister(RC);
-  BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
-  BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1)
-    .addReg(PromotedReg0).addImm(32);
-  BuildMI(BB, DL, TII.get(RShiftOp), PromotedReg2)
-    .addReg(PromotedReg1).addImm(32);
+  if (HasCPUv4_movsx) {
+    // One MOVSX replaces the whole mov/shift-left/shift-right sequence, so it
+    // has to define PromotedReg2, the register returned below. PromotedReg0/1
+    // are simply left unused on this path.
+    // NOTE(review): MOVSX_rr_32 always sign-extends while the fallback shifts
+    // with RShiftOp; confirm this branch is only reached for signed extension.
+    BuildMI(BB, DL, TII.get(BPF::MOVSX_rr_32), PromotedReg2).addReg(Reg);
+  } else {
+    BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+    BuildMI(BB, DL, TII.get(BPF::SLL_ri), PromotedReg1)
+      .addReg(PromotedReg0).addImm(32);
+    BuildMI(BB, DL, TII.get(RShiftOp), PromotedReg2)
+      .addReg(PromotedReg1).addImm(32);
+  }
 
   return PromotedReg2;
 }
diff --git a/llvm/lib/Target/BPF/BPFInstrFormats.td b/llvm/lib/Target/BPF/BPFInstrFormats.td
--- a/llvm/lib/Target/BPF/BPFInstrFormats.td
+++ b/llvm/lib/Target/BPF/BPFInstrFormats.td
@@ -90,6 +90,7 @@
 def BPF_ABS  : BPFModeModifer<0x1>;
 def BPF_IND  : BPFModeModifer<0x2>;
 def BPF_MEM  : BPFModeModifer<0x3>;
+def BPF_MEMSX  : BPFModeModifer<0x4>;
 def BPF_ATOMIC : BPFModeModifer<0x6>;
 
 class BPFAtomicFlag<bits<4> val> {
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -53,6 +53,12 @@
 def BPFIsBigEndian    : Predicate<"!CurDAG->getDataLayout().isLittleEndian()">;
 def BPFHasALU32 : Predicate<"Subtarget->getHasAlu32()">;
 def BPFNoALU32 : Predicate<"!Subtarget->getHasAlu32()">;
+def BPFHasCPUv4_ldsx : Predicate<"Subtarget->getCPUv4_ldsx()">;
+def BPFHasCPUv4_movsx : Predicate<"Subtarget->getCPUv4_movsx()">;
+def BPFHasCPUv4_bswap : Predicate<"Subtarget->getCPUv4_bswap()">;
+def BPFHasCPUv4_sdiv : Predicate<"Subtarget->getCPUv4_sdiv()">;
+def BPFNoCPUv4_movsx : Predicate<"!Subtarget->getCPUv4_movsx()">;
+def BPFNoCPUv4_bswap : Predicate<"!Subtarget->getCPUv4_bswap()">;
 
 def brtarget : Operand<OtherVT> {
   let PrintMethod = "printBrTargetOperand";
@@ -240,18 +246,19 @@
 }
 
 // ALU instructions
-class ALU_RI<BPFOpClass Class, BPFArithOp Opc,
+class ALU_RI<BPFOpClass Class, BPFArithOp Opc, int off,
              dag outs, dag ins, string asmstr, list<dag> pattern>
     : TYPE_ALU_JMP<Opc.Value, BPF_K.Value, outs, ins, asmstr, pattern> {
   bits<4> dst;
   bits<32> imm;
 
   let Inst{51-48} = dst;
+  let Inst{47-32} = off;
   let Inst{31-0} = imm;
   let BPFClass = Class;
 }
 
-class ALU_RR<BPFOpClass Class, BPFArithOp Opc,
+class ALU_RR<BPFOpClass Class, BPFArithOp Opc, int off,
              dag outs, dag ins, string asmstr, list<dag> pattern>
     : TYPE_ALU_JMP<Opc.Value, BPF_X.Value, outs, ins, asmstr, pattern> {
   bits<4> dst;
@@ -259,26 +266,27 @@
 
   let Inst{55-52} = src;
   let Inst{51-48} = dst;
+  let Inst{47-32} = off;
   let BPFClass = Class;
 }
 
-multiclass ALU<BPFArithOp Opc, string OpcodeStr, SDNode OpNode> {
-  def _rr : ALU_RR<BPF_ALU64, Opc,
+multiclass ALU<BPFArithOp Opc, int off, string OpcodeStr, SDNode OpNode> {
+  def _rr : ALU_RR<BPF_ALU64, Opc, off,
                    (outs GPR:$dst),
                    (ins GPR:$src2, GPR:$src),
                    "$dst "#OpcodeStr#" $src",
                    [(set GPR:$dst, (OpNode i64:$src2, i64:$src))]>;
-  def _ri : ALU_RI<BPF_ALU64, Opc,
+  def _ri : ALU_RI<BPF_ALU64, Opc, off,
                    (outs GPR:$dst),
                    (ins GPR:$src2, i64imm:$imm),
                    "$dst "#OpcodeStr#" $imm",
                    [(set GPR:$dst, (OpNode GPR:$src2, i64immSExt32:$imm))]>;
-  def _rr_32 : ALU_RR<BPF_ALU, Opc,
+  def _rr_32 : ALU_RR<BPF_ALU, Opc, off,
                    (outs GPR32:$dst),
                    (ins GPR32:$src2, GPR32:$src),
                    "$dst "#OpcodeStr#" $src",
                    [(set GPR32:$dst, (OpNode i32:$src2, i32:$src))]>;
-  def _ri_32 : ALU_RI<BPF_ALU, Opc,
+  def _ri_32 : ALU_RI<BPF_ALU, Opc, off,
                    (outs GPR32:$dst),
                    (ins GPR32:$src2, i32imm:$imm),
                    "$dst "#OpcodeStr#" $imm",
@@ -287,18 +295,23 @@
 
 let Constraints = "$dst = $src2" in {
 let isAsCheapAsAMove = 1 in {
-  defm ADD : ALU<BPF_ADD, "+=", add>;
-  defm SUB : ALU<BPF_SUB, "-=", sub>;
-  defm OR  : ALU<BPF_OR, "|=", or>;
-  defm AND : ALU<BPF_AND, "&=", and>;
-  defm SLL : ALU<BPF_LSH, "<<=", shl>;
-  defm SRL : ALU<BPF_RSH, ">>=", srl>;
-  defm XOR : ALU<BPF_XOR, "^=", xor>;
-  defm SRA : ALU<BPF_ARSH, "s>>=", sra>;
-}
-  defm MUL : ALU<BPF_MUL, "*=", mul>;
-  defm DIV : ALU<BPF_DIV, "/=", udiv>;
-  defm MOD : ALU<BPF_MOD, "%=", urem>;
+  defm ADD : ALU<BPF_ADD, 0, "+=", add>;
+  defm SUB : ALU<BPF_SUB, 0, "-=", sub>;
+  defm OR  : ALU<BPF_OR, 0, "|=", or>;
+  defm AND : ALU<BPF_AND, 0, "&=", and>;
+  defm SLL : ALU<BPF_LSH, 0, "<<=", shl>;
+  defm SRL : ALU<BPF_RSH, 0, ">>=", srl>;
+  defm XOR : ALU<BPF_XOR, 0, "^=", xor>;
+  defm SRA : ALU<BPF_ARSH, 0, "s>>=", sra>;
+}
+  defm MUL : ALU<BPF_MUL, 0, "*=", mul>;
+  defm DIV : ALU<BPF_DIV, 0, "/=", udiv>;
+  defm MOD : ALU<BPF_MOD, 0, "%=", urem>;
+
+  let Predicates = [BPFHasCPUv4_sdiv] in {
+    defm SDIV : ALU<BPF_DIV, 1, "s/=", sdiv>;
+    defm SMOD : ALU<BPF_MOD, 1, "s%=", srem>;
+  }
 }
 
 class NEG_RR<BPFOpClass Class, BPFArithOp Opc,
@@ -338,26 +351,49 @@
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def LD_imm64 : LD_IMM64<0, "=">;
-def MOV_rr : ALU_RR<BPF_ALU64, BPF_MOV,
+def MOV_rr : ALU_RR<BPF_ALU64, BPF_MOV, 0,
                     (outs GPR:$dst),
                     (ins GPR:$src),
                     "$dst = $src",
                     []>;
-def MOV_ri : ALU_RI<BPF_ALU64, BPF_MOV,
+def MOV_ri : ALU_RI<BPF_ALU64, BPF_MOV, 0,
                     (outs GPR:$dst),
                     (ins i64imm:$imm),
                     "$dst = $imm",
                     [(set GPR:$dst, (i64 i64immSExt32:$imm))]>;
-def MOV_rr_32 : ALU_RR<BPF_ALU, BPF_MOV,
+def MOV_rr_32 : ALU_RR<BPF_ALU, BPF_MOV, 0,
                     (outs GPR32:$dst),
                     (ins GPR32:$src),
                     "$dst = $src",
                     []>;
-def MOV_ri_32 : ALU_RI<BPF_ALU, BPF_MOV,
+def MOV_ri_32 : ALU_RI<BPF_ALU, BPF_MOV, 0,
                     (outs GPR32:$dst),
                     (ins i32imm:$imm),
                     "$dst = $imm",
                     [(set GPR32:$dst, (i32 i32immSExt32:$imm))]>;
+
+let Predicates = [BPFHasCPUv4_movsx] in {
+  def MOVSX_rr_8 : ALU_RR<BPF_ALU64, BPF_MOV, 8,
+                      (outs GPR:$dst), (ins GPR:$src),
+                      "$dst = (s8)$src",
+                      [(set GPR:$dst, (sra (shl GPR:$src, (i64 56)), (i64 56)))]>;
+  def MOVSX_rr_16 : ALU_RR<BPF_ALU64, BPF_MOV, 16,
+                      (outs GPR:$dst), (ins GPR:$src),
+                      "$dst = (s16)$src",
+                      [(set GPR:$dst, (sra (shl GPR:$src, (i64 48)), (i64 48)))]>;
+  def MOVSX_rr_32 : ALU_RR<BPF_ALU64, BPF_MOV, 32,
+                      (outs GPR:$dst), (ins GPR32:$src),
+                      "$dst = (s32)$src",
+                      [(set GPR:$dst, (sext GPR32:$src))]>;
+  def MOVSX_rr_32_8 : ALU_RR<BPF_ALU, BPF_MOV, 8,
+                      (outs GPR32:$dst), (ins GPR32:$src),
+                      "$dst = (s8)$src",
+                      [(set GPR32:$dst, (sra (shl GPR32:$src, (i32 24)), (i32 24)))]>;
+  def MOVSX_rr_32_16 : ALU_RR<BPF_ALU, BPF_MOV, 16,
+                      (outs GPR32:$dst), (ins GPR32:$src),
+                      "$dst = (s16)$src",
+                      [(set GPR32:$dst, (sra (shl GPR32:$src, (i32 16)), (i32 16)))]>;
+}
 }
 
 def FI_ri
@@ -421,8 +457,8 @@
 def STD : STOREi64<BPF_DW, "u64", store>;
 
 // LOAD instructions
-class LOAD<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
-    : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+class LOAD<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, list<dag> Pattern>
+    : TYPE_LD_ST<ModOp.Value, SizeOp.Value,
                  (outs GPR:$dst),
                  (ins MEMri:$addr),
                  "$dst = *("#OpcodeStr#" *)($addr)",
@@ -436,8 +472,8 @@
   let BPFClass = BPF_LDX;
 }
 
-class LOADi64<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
-    : LOAD<SizeOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
+class LOADi64<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, PatFrag OpNode>
+    : LOAD<SizeOp, ModOp, OpcodeStr, [(set i64:$dst, (OpNode ADDRri:$addr))]>;
 
 let isCodeGenOnly = 1 in {
   def CORE_MEM : TYPE_LD_ST<BPF_MEM.Value, BPF_W.Value,
@@ -451,7 +487,7 @@
                                   "$dst = core_alu32_mem($opcode, $src, $offset)",
                                   []>;
   let Constraints = "$dst = $src" in {
-    def CORE_SHIFT : ALU_RR<BPF_ALU64, BPF_LSH,
+    def CORE_SHIFT : ALU_RR<BPF_ALU64, BPF_LSH, 0,
                              (outs GPR:$dst),
                              (ins u64imm:$opcode, GPR:$src, u64imm:$offset),
                              "$dst = core_shift($opcode, $src, $offset)",
@@ -460,12 +496,18 @@
 }
 
 let Predicates = [BPFNoALU32] in {
-  def LDW : LOADi64<BPF_W, "u32", zextloadi32>;
-  def LDH : LOADi64<BPF_H, "u16", zextloadi16>;
-  def LDB : LOADi64<BPF_B, "u8", zextloadi8>;
+  def LDW : LOADi64<BPF_W, BPF_MEM, "u32", zextloadi32>;
+  def LDH : LOADi64<BPF_H, BPF_MEM, "u16", zextloadi16>;
+  def LDB : LOADi64<BPF_B, BPF_MEM, "u8", zextloadi8>;
+}
+
+let Predicates = [BPFHasCPUv4_ldsx] in {
+  def LDWSX : LOADi64<BPF_W, BPF_MEMSX, "s32", sextloadi32>;
+  def LDHSX : LOADi64<BPF_H, BPF_MEMSX, "s16", sextloadi16>;
+  def LDBSX : LOADi64<BPF_B, BPF_MEMSX, "s8",  sextloadi8>;
 }
 
-def LDD : LOADi64<BPF_DW, "u64", load>;
+def LDD : LOADi64<BPF_DW, BPF_MEM, "u64", load>;
 
 class BRANCH<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
     : TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
@@ -479,6 +521,18 @@
   let BPFClass = BPF_JMP;
 }
 
+class BRANCH_LONG<BPFJumpOp Opc, string OpcodeStr, list<dag> Pattern>
+    : TYPE_ALU_JMP<Opc.Value, BPF_K.Value,
+                   (outs),
+                   (ins brtarget:$BrDst),
+                   !strconcat(OpcodeStr, " $BrDst"),
+                   Pattern> {
+  bits<32> BrDst;
+
+  let Inst{31-0} = BrDst;
+  let BPFClass = BPF_JMP32;
+}
+
 class CALL<string OpcodeStr>
     : TYPE_ALU_JMP<BPF_CALL.Value, BPF_K.Value,
                    (outs),
@@ -506,6 +560,7 @@
 // Jump always
 let isBranch = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1 in {
   def JMP : BRANCH<BPF_JA, "goto", [(br bb:$BrDst)]>;
+  def JMPL : BRANCH_LONG<BPF_JA, "gotol", []>;
 }
 
 // Jump and link
@@ -835,7 +890,7 @@
 }
 
 // bswap16, bswap32, bswap64
-class BSWAP<bits<32> SizeOp, string OpcodeStr, BPFSrcType SrcType, list<dag> Pattern>
+class BSWAP<BPFOpClass Class, bits<32> SizeOp, string OpcodeStr, BPFSrcType SrcType, list<dag> Pattern>
     : TYPE_ALU_JMP<BPF_END.Value, SrcType.Value,
                    (outs GPR:$dst),
                    (ins GPR:$src),
@@ -845,21 +900,29 @@
 
   let Inst{51-48} = dst;
   let Inst{31-0} = SizeOp;
-  let BPFClass = BPF_ALU;
+  let BPFClass = Class;
 }
 
 
 let Constraints = "$dst = $src" in {
+  let Predicates = [BPFHasCPUv4_bswap] in {
+    def BSWAP16 : BSWAP<BPF_ALU64, 16, "bswap16", BPF_TO_LE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
+    def BSWAP32 : BSWAP<BPF_ALU64, 32, "bswap32", BPF_TO_LE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
+    def BSWAP64 : BSWAP<BPF_ALU64, 64, "bswap64", BPF_TO_LE, [(set GPR:$dst, (bswap GPR:$src))]>;
+  }
+
+  let Predicates = [BPFNoCPUv4_bswap] in {
     let Predicates = [BPFIsLittleEndian] in {
-        def BE16 : BSWAP<16, "be16", BPF_TO_BE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
-        def BE32 : BSWAP<32, "be32", BPF_TO_BE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
-        def BE64 : BSWAP<64, "be64", BPF_TO_BE, [(set GPR:$dst, (bswap GPR:$src))]>;
+        def BE16 : BSWAP<BPF_ALU, 16, "be16", BPF_TO_BE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
+        def BE32 : BSWAP<BPF_ALU, 32, "be32", BPF_TO_BE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
+        def BE64 : BSWAP<BPF_ALU, 64, "be64", BPF_TO_BE, [(set GPR:$dst, (bswap GPR:$src))]>;
     }
     let Predicates = [BPFIsBigEndian] in {
-        def LE16 : BSWAP<16, "le16", BPF_TO_LE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
-        def LE32 : BSWAP<32, "le32", BPF_TO_LE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
-        def LE64 : BSWAP<64, "le64", BPF_TO_LE, [(set GPR:$dst, (bswap GPR:$src))]>;
+        def LE16 : BSWAP<BPF_ALU, 16, "le16", BPF_TO_LE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 48)))]>;
+        def LE32 : BSWAP<BPF_ALU, 32, "le32", BPF_TO_LE, [(set GPR:$dst, (srl (bswap GPR:$src), (i64 32)))]>;
+        def LE64 : BSWAP<BPF_ALU, 64, "le64", BPF_TO_LE, [(set GPR:$dst, (bswap GPR:$src))]>;
     }
+  }
 }
 
 let Defs = [R0, R1, R2, R3, R4, R5], Uses = [R6], hasSideEffects = 1,
@@ -898,13 +961,15 @@
 def LD_IND_W : LOAD_IND<BPF_W, "u32", int_bpf_load_word>;
 
 let isCodeGenOnly = 1 in {
-  def MOV_32_64 : ALU_RR<BPF_ALU, BPF_MOV,
+  def MOV_32_64 : ALU_RR<BPF_ALU, BPF_MOV, 0,
                          (outs GPR:$dst), (ins GPR32:$src),
                          "$dst = $src", []>;
 }
 
-def : Pat<(i64 (sext GPR32:$src)),
-          (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+let Predicates = [BPFNoCPUv4_movsx] in {
+  def : Pat<(i64 (sext GPR32:$src)),
+            (SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+}
 
 def : Pat<(i64 (zext GPR32:$src)), (MOV_32_64 GPR32:$src)>;
 
@@ -940,8 +1005,8 @@
   def STB32 : STOREi32<BPF_B, "u8", truncstorei8>;
 }
 
-class LOAD32<BPFWidthModifer SizeOp, string OpcodeStr, list<dag> Pattern>
-    : TYPE_LD_ST<BPF_MEM.Value, SizeOp.Value,
+class LOAD32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, list<dag> Pattern>
+    : TYPE_LD_ST<ModOp.Value, SizeOp.Value,
                 (outs GPR32:$dst),
                 (ins MEMri:$addr),
                 "$dst = *("#OpcodeStr#" *)($addr)",
@@ -955,13 +1020,18 @@
   let BPFClass = BPF_LDX;
 }
 
-class LOADi32<BPFWidthModifer SizeOp, string OpcodeStr, PatFrag OpNode>
-    : LOAD32<SizeOp, OpcodeStr, [(set i32:$dst, (OpNode ADDRri:$addr))]>;
+class LOADi32<BPFWidthModifer SizeOp, BPFModeModifer ModOp, string OpcodeStr, PatFrag OpNode>
+    : LOAD32<SizeOp, ModOp, OpcodeStr, [(set i32:$dst, (OpNode ADDRri:$addr))]>;
 
 let Predicates = [BPFHasALU32], DecoderNamespace = "BPFALU32" in {
-  def LDW32 : LOADi32<BPF_W, "u32", load>;
-  def LDH32 : LOADi32<BPF_H, "u16", zextloadi16>;
-  def LDB32 : LOADi32<BPF_B, "u8", zextloadi8>;
+  def LDW32 : LOADi32<BPF_W, BPF_MEM, "u32", load>;
+  def LDH32 : LOADi32<BPF_H, BPF_MEM, "u16", zextloadi16>;
+  def LDB32 : LOADi32<BPF_B, BPF_MEM, "u8", zextloadi8>;
+}
+
+let Predicates = [BPFHasCPUv4_ldsx], DecoderNamespace = "BPFALU32" in {
+  def LDH32SX : LOADi32<BPF_H, BPF_MEMSX, "s16", sextloadi16>;
+  def LDB32SX : LOADi32<BPF_B, BPF_MEMSX, "s8", sextloadi8>;
 }
 
 let Predicates = [BPFHasALU32] in {
@@ -973,6 +1043,7 @@
             (STW32 (EXTRACT_SUBREG GPR:$src, sub_32), ADDRri:$dst)>;
   def : Pat<(i32 (extloadi8 ADDRri:$src)), (i32 (LDB32 ADDRri:$src))>;
   def : Pat<(i32 (extloadi16 ADDRri:$src)), (i32 (LDH32 ADDRri:$src))>;
+
   def : Pat<(i64 (zextloadi8  ADDRri:$src)),
             (SUBREG_TO_REG (i64 0), (LDB32 ADDRri:$src), sub_32)>;
   def : Pat<(i64 (zextloadi16 ADDRri:$src)),
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include <set>
+#include <map>
 
 using namespace llvm;
 
@@ -301,7 +302,9 @@
 
   static char ID;
   MachineFunction *MF;
+  const BPFInstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  bool IsCPUv4;
 
   BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {
     initializeBPFMIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
@@ -311,7 +314,11 @@
   // Initialize class variables.
   void initialize(MachineFunction &MFParm);
 
+  bool in16BitRange(int Num);
   bool eliminateRedundantMov();
+  bool adjustBranch();
+
+  std::map<unsigned, unsigned> ReverseCondOpMap;
 
 public:
 
@@ -322,14 +329,20 @@
 
     initialize(MF);
 
-    return eliminateRedundantMov();
+    bool Changed;
+    Changed = eliminateRedundantMov();
+    if (IsCPUv4)
+      Changed = adjustBranch() || Changed;
+    return Changed;
   }
 };
 
 // Initialize class variables.
 void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
   MF = &MFParm;
+  TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
   TRI = MF->getSubtarget<BPFSubtarget>().getRegisterInfo();
+  IsCPUv4 = MF->getSubtarget<BPFSubtarget>().getCPUv4_ja();
   LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
 }
 
@@ -374,6 +387,215 @@
   return Eliminated;
 }
 
+bool BPFMIPreEmitPeephole::in16BitRange(int Num) {
+  // The cut-off is deliberately not at the exact 16bit range: the
+  // transformation itself adds new insns, so be conservative and only
+  // accept half of the signed 16bit range.
+  return Num >= (INT16_MIN >> 1) && Num <= (INT16_MAX >> 1);
+}
+
+// Before cpu=v4, only a 16bit branch target offset (-0x8000 to 0x7fff)
+// is supported for both unconditional (JMP) and conditional (JEQ, JSGT,
+// etc.) branches. In certain cases, e.g., full unrolling, the branch
+// target offset might exceed the 16bit range, and llvm would generate
+// incorrect code as the offset is truncated to 16bit.
+//
+// To fix this rare case, a new insn JMPL with a 32bit branch target
+// offset is introduced. It is not used during insn selection. Rather,
+// this pass estimates each branch distance as (start of target block) -
+// (end of branching block), counted in MachineInstrs, and converts
+// JMP -> JMPL and JEQ -> JEQ + JMPL when the estimate is beyond the
+// conservative range accepted by in16BitRange().
+bool BPFMIPreEmitPeephole::adjustBranch() {
+  bool Changed = false;
+  int CurrNumInsns = 0;
+  std::map<MachineBasicBlock *, int> SoFarNumInsns;
+  std::map<MachineBasicBlock *, MachineBasicBlock *> FollowThroughBB;
+  std::vector<MachineBasicBlock *> MBBs;
+
+  MachineBasicBlock *PrevBB = nullptr;
+  for (MachineBasicBlock &MBB : *MF) {
+    // MBB.size() is the number of insns in this basic block, including some
+    // debug info, e.g., DEBUG_VALUE, so we may over-count a little bit.
+    // Typically we have way more normal insns than DEBUG_VALUE insns.
+    // Also, if we indeed need to convert a conditional branch like JEQ to
+    // JEQ + JMPL, we actually introduce some new insns, see below.
+    CurrNumInsns += (int)MBB.size();
+    SoFarNumInsns[&MBB] = CurrNumInsns;
+    if (PrevBB != nullptr)
+      FollowThroughBB[PrevBB] = &MBB;
+    PrevBB = &MBB;
+    // A list of original BBs to make later traversal easier.
+    MBBs.push_back(&MBB);
+  }
+  FollowThroughBB[PrevBB] = nullptr;
+
+  for (unsigned i = 0; i < MBBs.size(); i++) {
+    // We have four cases here:
+    //  (1). no terminator, simple follow through.
+    //  (2). jmp to another bb.
+    //  (3). conditional jmp to another bb or follow through.
+    //  (4). conditional jmp followed by an unconditional jmp.
+    MachineInstr *CondJmp = nullptr, *UncondJmp = nullptr;
+
+    MachineBasicBlock *MBB = MBBs[i];
+    for (MachineInstr &Term : MBB->terminators()) {
+      if (Term.isConditionalBranch()) {
+        assert(CondJmp == nullptr);
+        CondJmp = &Term;
+      } else if (Term.isUnconditionalBranch()) {
+        assert(UncondJmp == nullptr);
+        UncondJmp = &Term;
+      }
+    }
+
+    // (1). no terminator, simple follow through.
+    if (!CondJmp && !UncondJmp)
+      continue;
+
+    MachineBasicBlock *CondTargetBB, *JmpBB;
+    CurrNumInsns = SoFarNumInsns[MBB];
+
+    // (2). jmp to another bb.
+    if (!CondJmp && UncondJmp) {
+      JmpBB = UncondJmp->getOperand(0).getMBB();
+      if (in16BitRange(SoFarNumInsns[JmpBB] - JmpBB->size() - CurrNumInsns))
+        continue;
+
+      // replace this insn as a JMPL.
+      BuildMI(MBB, UncondJmp->getDebugLoc(), TII->get(BPF::JMPL)).addMBB(JmpBB);
+      UncondJmp->eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    const BasicBlock *TermBB = MBB->getBasicBlock();
+    int Dist;
+
+    // (3). conditional jmp to another bb or follow through.
+    if (!UncondJmp) {
+      CondTargetBB = CondJmp->getOperand(2).getMBB();
+      MachineBasicBlock *FollowBB = FollowThroughBB[MBB];
+      Dist = SoFarNumInsns[CondTargetBB] - CondTargetBB->size() - CurrNumInsns;
+      if (in16BitRange(Dist))
+        continue;
+
+      // We have
+      //   B2: ...
+      //       if (cond) goto B5
+      //   B3: ...
+      // where B2 -> B5 is beyond 16bit range.
+      //
+      // We do not have 32bit cond jmp insn. So we try to do
+      // the following.
+      //   B2:     ...
+      //           if (cond) goto New_B1
+      //   New_B0  goto B3
+      //   New_B1: gotol B5
+      //   B3: ...
+      // Basically two new basic blocks are created.
+      MachineBasicBlock *New_B0 = MF->CreateMachineBasicBlock(TermBB);
+      MachineBasicBlock *New_B1 = MF->CreateMachineBasicBlock(TermBB);
+
+      // Insert New_B0 and New_B1 into function block list.
+      MachineFunction::iterator MBB_I  = ++MBB->getIterator();
+      MF->insert(MBB_I, New_B0);
+      MF->insert(MBB_I, New_B1);
+
+      // replace B2 cond jump
+      if (CondJmp->getOperand(1).isReg())
+        BuildMI(*MBB, MachineBasicBlock::iterator(*CondJmp), CondJmp->getDebugLoc(), TII->get(CondJmp->getOpcode()))
+            .addReg(CondJmp->getOperand(0).getReg())
+            .addReg(CondJmp->getOperand(1).getReg())
+            .addMBB(New_B1);
+      else
+        BuildMI(*MBB, MachineBasicBlock::iterator(*CondJmp), CondJmp->getDebugLoc(), TII->get(CondJmp->getOpcode()))
+            .addReg(CondJmp->getOperand(0).getReg())
+            .addImm(CondJmp->getOperand(1).getImm())
+            .addMBB(New_B1);
+
+      // it is possible that CondTargetBB and FollowBB are the same. But the
+      // above Dist checking should have already filtered out this case.
+      MBB->removeSuccessor(CondTargetBB);
+      MBB->removeSuccessor(FollowBB);
+      MBB->addSuccessor(New_B0);
+      MBB->addSuccessor(New_B1);
+
+      // Populate insns in New_B0 and New_B1.
+      BuildMI(New_B0, CondJmp->getDebugLoc(), TII->get(BPF::JMP)).addMBB(FollowBB);
+      BuildMI(New_B1, CondJmp->getDebugLoc(), TII->get(BPF::JMPL))
+          .addMBB(CondTargetBB);
+
+      New_B0->addSuccessor(FollowBB);
+      New_B1->addSuccessor(CondTargetBB);
+      CondJmp->eraseFromParent();
+      Changed = true;
+      continue;
+    }
+
+    //  (4). conditional jmp followed by an unconditional jmp.
+    CondTargetBB = CondJmp->getOperand(2).getMBB();
+    JmpBB = UncondJmp->getOperand(0).getMBB();
+
+    // We have
+    //   B2: ...
+    //       if (cond) goto B5
+    //       JMP B7
+    //   B3: ...
+    //
+    // If only B2->B5 is out of 16bit range, we can do
+    //   B2: ...
+    //       if (cond) goto New_B
+    //       JMP B7
+    //   New_B: gotol B5
+    //   B3: ...
+    //
+    // If only 'JMP B7' is out of 16bit range, we can replace
+    // 'JMP B7' with 'JMPL B7'.
+    //
+    // If both B2->B5 and 'JMP B7' are out of range, just do
+    // both the above transformations.
+    Dist = SoFarNumInsns[CondTargetBB] - CondTargetBB->size() - CurrNumInsns;
+    if (!in16BitRange(Dist)) {
+      MachineBasicBlock *New_B = MF->CreateMachineBasicBlock(TermBB);
+
+      // Insert New_B into function block list.
+      MF->insert(++MBB->getIterator(), New_B);
+
+      // replace B2 cond jump
+      if (CondJmp->getOperand(1).isReg())
+        BuildMI(*MBB, MachineBasicBlock::iterator(*CondJmp), CondJmp->getDebugLoc(), TII->get(CondJmp->getOpcode()))
+            .addReg(CondJmp->getOperand(0).getReg())
+            .addReg(CondJmp->getOperand(1).getReg())
+            .addMBB(New_B);
+      else
+        BuildMI(*MBB, MachineBasicBlock::iterator(*CondJmp), CondJmp->getDebugLoc(), TII->get(CondJmp->getOpcode()))
+            .addReg(CondJmp->getOperand(0).getReg())
+            .addImm(CondJmp->getOperand(1).getImm())
+            .addMBB(New_B);
+
+      if (CondTargetBB != JmpBB)
+        MBB->removeSuccessor(CondTargetBB);
+      MBB->addSuccessor(New_B);
+
+      // Populate insn in New_B.
+      BuildMI(New_B, CondJmp->getDebugLoc(), TII->get(BPF::JMPL)).addMBB(CondTargetBB);
+
+      New_B->addSuccessor(CondTargetBB);
+      CondJmp->eraseFromParent();
+      Changed = true;
+    }
+
+    if (!in16BitRange(SoFarNumInsns[JmpBB] - JmpBB->size() - CurrNumInsns)) {
+      BuildMI(MBB, UncondJmp->getDebugLoc(), TII->get(BPF::JMPL)).addMBB(JmpBB);
+      UncondJmp->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
 } // end default namespace
 
 INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",
diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
--- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
+++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -96,7 +96,9 @@
 bool BPFMISimplifyPatchable::isLoadInst(unsigned Opcode) {
   return Opcode == BPF::LDD || Opcode == BPF::LDW || Opcode == BPF::LDH ||
          Opcode == BPF::LDB || Opcode == BPF::LDW32 || Opcode == BPF::LDH32 ||
-         Opcode == BPF::LDB32;
+         Opcode == BPF::LDB32 || Opcode == BPF::LDWSX || Opcode == BPF::LDHSX ||
+         Opcode == BPF::LDBSX || Opcode == BPF::LDH32SX ||
+         Opcode == BPF::LDB32SX;
 }
 
 void BPFMISimplifyPatchable::checkADDrr(MachineRegisterInfo *MRI,
@@ -119,7 +121,9 @@
     unsigned COREOp;
     if (Opcode == BPF::LDB || Opcode == BPF::LDH || Opcode == BPF::LDW ||
         Opcode == BPF::LDD || Opcode == BPF::STB || Opcode == BPF::STH ||
-        Opcode == BPF::STW || Opcode == BPF::STD)
+        Opcode == BPF::STW || Opcode == BPF::STD || Opcode == BPF::LDWSX ||
+        Opcode == BPF::LDHSX || Opcode == BPF::LDBSX || Opcode == BPF::LDH32SX ||
+        Opcode == BPF::LDB32SX)
       COREOp = BPF::CORE_MEM;
     else if (Opcode == BPF::LDB32 || Opcode == BPF::LDH32 ||
              Opcode == BPF::LDW32 || Opcode == BPF::STB32 ||
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.h b/llvm/lib/Target/BPF/BPFSubtarget.h
--- a/llvm/lib/Target/BPF/BPFSubtarget.h
+++ b/llvm/lib/Target/BPF/BPFSubtarget.h
@@ -56,6 +56,9 @@
   // whether we should enable MCAsmInfo DwarfUsesRelocationsAcrossSections
   bool UseDwarfRIS;
 
+  // whether cpu v4 insns are enabled.
+  bool CPUv4_ldsx, CPUv4_movsx, CPUv4_bswap, CPUv4_sdiv, CPUv4_ja;
+
 public:
   // This constructor initializes the data members to match that
   // of the specified triple.
@@ -71,6 +74,11 @@
   bool getHasJmp32() const { return HasJmp32; }
   bool getHasAlu32() const { return HasAlu32; }
   bool getUseDwarfRIS() const { return UseDwarfRIS; }
+  bool getCPUv4_ldsx() const { return CPUv4_ldsx; }
+  bool getCPUv4_movsx() const { return CPUv4_movsx; }
+  bool getCPUv4_bswap() const { return CPUv4_bswap; }
+  bool getCPUv4_sdiv() const { return CPUv4_sdiv; }
+  bool getCPUv4_ja() const { return CPUv4_ja; }
 
   const BPFInstrInfo *getInstrInfo() const override { return &InstrInfo; }
   const BPFFrameLowering *getFrameLowering() const override {
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -23,6 +23,17 @@
 #define GET_SUBTARGETINFO_CTOR
 #include "BPFGenSubtargetInfo.inc"
 
+static cl::opt<bool> Disable_CPUv4_ldsx("disable-cpuv4-ldsx", cl::Hidden,
+    cl::init(false), cl::desc("Disable ldsx insns in cpuv4"));
+static cl::opt<bool> Disable_CPUv4_movsx("disable-cpuv4-movsx", cl::Hidden,
+    cl::init(false), cl::desc("Disable movsx insns in cpuv4"));
+static cl::opt<bool> Disable_CPUv4_bswap("disable-cpuv4-bswap", cl::Hidden,
+    cl::init(false), cl::desc("Disable bswap insns in cpuv4"));
+static cl::opt<bool> Disable_CPUv4_sdiv("disable-cpuv4-sdiv-smod", cl::Hidden,
+    cl::init(false), cl::desc("Disable sdiv/smod insns in cpuv4"));
+static cl::opt<bool> Disable_CPUv4_ja("disable-cpuv4-ja", cl::Hidden,
+    cl::init(false), cl::desc("Disable 32-bit offset ja insn in cpuv4"));
+
 void BPFSubtarget::anchor() {}
 
 BPFSubtarget &BPFSubtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -38,6 +49,11 @@
   HasJmp32 = false;
   HasAlu32 = false;
   UseDwarfRIS = false;
+  CPUv4_ldsx = false;
+  CPUv4_movsx = false;
+  CPUv4_bswap = false;
+  CPUv4_sdiv = false;
+  CPUv4_ja = false;
 }
 
 void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
@@ -55,6 +71,17 @@
     HasAlu32 = true;
     return;
   }
+  if (CPU == "v4") {
+    HasJmpExt = true;
+    HasJmp32 = true;
+    HasAlu32 = true;
+    CPUv4_ldsx = !Disable_CPUv4_ldsx;
+    CPUv4_movsx = !Disable_CPUv4_movsx;
+    CPUv4_bswap = !Disable_CPUv4_bswap;
+    CPUv4_sdiv = !Disable_CPUv4_sdiv;
+    CPUv4_ja = !Disable_CPUv4_ja;
+    return;
+  }
 }
 
 BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -57,8 +57,7 @@
     BPF_ABS = 0x1,
     BPF_IND = 0x2,
     BPF_MEM = 0x3,
-    BPF_LEN = 0x4,
-    BPF_MSH = 0x5,
+    BPF_MEMSX = 0x4,
     BPF_ATOMIC = 0x6
   };
 
@@ -178,7 +177,7 @@
   uint8_t InstMode = getInstMode(Insn);
   if ((InstClass == BPF_LDX || InstClass == BPF_STX) &&
       getInstSize(Insn) != BPF_DW &&
-      (InstMode == BPF_MEM || InstMode == BPF_ATOMIC) &&
+      (InstMode == BPF_MEM || InstMode == BPF_MEMSX || InstMode == BPF_ATOMIC) &&
       STI.hasFeature(BPF::ALU32))
     Result = decodeInstruction(DecoderTableBPFALU3264, Instr, Insn, Address,
                                this, STI);
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -6,12 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/BPFMCFixups.h"
 #include "MCTargetDesc/BPFMCTargetDesc.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/Support/EndianStream.h"
 #include <cassert>
@@ -41,7 +43,10 @@
     return false;
   }
 
-  unsigned getNumFixupKinds() const override { return 1; }
+  unsigned getNumFixupKinds() const override {
+    return BPF::NumTargetFixupKinds;
+  }
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
 
   bool writeNopData(raw_ostream &OS, uint64_t Count,
                     const MCSubtargetInfo *STI) const override;
@@ -49,6 +54,20 @@
 
 } // end anonymous namespace
 
+const MCFixupKindInfo &
+BPFAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  // Descriptors for BPF-specific fixups, indexed from FirstTargetFixupKind.
+  static const MCFixupKindInfo TargetInfos[BPF::NumTargetFixupKinds] = {
+    { "FK_BPF_PCRel_4",  0, 32, MCFixupKindInfo::FKF_IsPCRel },
+  };
+  // Kinds below FirstTargetFixupKind are described by the base class.
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+  const unsigned Idx = unsigned(Kind - FirstTargetFixupKind);
+  assert(Idx < getNumFixupKinds() && "Invalid kind!");
+  return TargetInfos[Idx];
+}
+
 bool BPFAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count,
                                  const MCSubtargetInfo *STI) const {
   if ((Count % 8) != 0)
@@ -85,6 +104,11 @@
       Data[Fixup.getOffset() + 1] = 0x1;
       support::endian::write32be(&Data[Fixup.getOffset() + 4], Value);
     }
+  } else if (Fixup.getTargetKind() == BPF::FK_BPF_PCRel_4) {
+    // The input Value represents the number of bytes.
+    Value = (uint32_t)((Value - 8) / 8);
+    support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value,
+                                     Endian);
   } else {
     assert(Fixup.getKind() == FK_PCRel_2);
 
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -10,6 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+
+#include "BPF.h"
 #include "MCTargetDesc/BPFInstPrinter.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
@@ -100,8 +102,13 @@
                                        raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isImm()) {
-    int16_t Imm = Op.getImm();
-    O << ((Imm >= 0) ? "+" : "") << formatImm(Imm);
+    if (MI->getOpcode() == BPF::JMPL) {
+      int32_t Imm = Op.getImm();
+      O << ((Imm >= 0) ? "+" : "") << formatImm(Imm);
+    } else {
+      int16_t Imm = Op.getImm();
+      O << ((Imm >= 0) ? "+" : "") << formatImm(Imm);
+    }
   } else if (Op.isExpr()) {
     printExpr(Op.getExpr(), O);
   } else {
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCCodeEmitter.cpp
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/BPFMCFixups.h"
 #include "MCTargetDesc/BPFMCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -95,6 +96,8 @@
     Fixups.push_back(MCFixup::create(0, Expr, FK_PCRel_4));
   else if (MI.getOpcode() == BPF::LD_imm64)
     Fixups.push_back(MCFixup::create(0, Expr, FK_SecRel_8));
+  else if (MI.getOpcode() == BPF::JMPL)
+    Fixups.push_back(MCFixup::create(0, Expr, (MCFixupKind)BPF::FK_BPF_PCRel_4));
   else
     // bb label
     Fixups.push_back(MCFixup::create(0, Expr, FK_PCRel_2));
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCFixups.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCFixups.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCFixups.h
@@ -0,0 +1,27 @@
+//=======-- BPFMCFixups.h - BPF-specific fixup entries ------*- C++ -*-=======//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCFIXUPS_H
+#define LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCFIXUPS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace BPF {
+enum FixupKind {
+  // BPF-specific fixup kinds. FK_BPF_PCRel_4 is a 32-bit PC-relative fixup.
+  FK_BPF_PCRel_4 = FirstTargetFixupKind,
+
+  // Marker: one past the last BPF fixup kind; used to compute the count.
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace BPF
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_BPF_MCTARGETDESC_BPFMCFIXUPS_H
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp
@@ -79,12 +79,15 @@
   bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
                       uint64_t &Target) const override {
     // The target is the 3rd operand of cond inst and the 1st of uncond inst.
-    int16_t Imm;
+    int32_t Imm;
     if (isConditionalBranch(Inst)) {
-      Imm = Inst.getOperand(2).getImm();
-    } else if (isUnconditionalBranch(Inst))
-      Imm = Inst.getOperand(0).getImm();
-    else
+      Imm = (short)Inst.getOperand(2).getImm();
+    } else if (isUnconditionalBranch(Inst)) {
+      if (Inst.getOpcode() == BPF::JMP)
+        Imm = (short)Inst.getOperand(0).getImm();
+      else
+        Imm = (int)Inst.getOperand(0).getImm();
+    } else
       return false;
 
     Target = Addr + Size + Imm * Size;
diff --git a/llvm/test/CodeGen/BPF/bswap.ll b/llvm/test/CodeGen/BPF/bswap.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/bswap.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=bpfel -mcpu=v4 -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; Source:
+;  long foo(int a, int b, long c) {
+;    a = __builtin_bswap16(a);
+;    b = __builtin_bswap32(b);
+;    c = __builtin_bswap64(c);
+;    return a + b + c;
+;  }
+; Compilation flags:
+;   clang -target bpf -O2 -S -emit-llvm t.c
+
+; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
+define dso_local i64 @foo(i32 noundef %a, i32 noundef %b, i64 noundef %c) local_unnamed_addr #0 {
+entry:
+  %conv = trunc i32 %a to i16
+  %0 = tail call i16 @llvm.bswap.i16(i16 %conv)
+  %conv1 = zext i16 %0 to i32
+  %1 = tail call i32 @llvm.bswap.i32(i32 %b)
+  %2 = tail call i64 @llvm.bswap.i64(i64 %c)
+  %add = add nsw i32 %1, %conv1
+  %conv2 = sext i32 %add to i64
+  %add3 = add nsw i64 %2, %conv2
+  ret i64 %add3
+}
+
+; CHECK: r1 = bswap16 r1                         # encoding: [0xd7,0x01,0x00,0x00,0x10,0x00,0x00,0x00]
+; CHECK: r2 = bswap32 r2                         # encoding: [0xd7,0x02,0x00,0x00,0x20,0x00,0x00,0x00]
+; CHECK: r0 = bswap64 r0                         # encoding: [0xd7,0x00,0x00,0x00,0x40,0x00,0x00,0x00]
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i16 @llvm.bswap.i16(i16) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.bswap.i32(i32) #1
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.bswap.i64(i64) #1
+
+attributes #0 = { mustprogress nofree nosync nounwind willreturn memory(none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git a2913a8a2bfe572d2f1bfea950ab9b0848373648)"}
diff --git a/llvm/test/CodeGen/BPF/ldsx.ll b/llvm/test/CodeGen/BPF/ldsx.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/ldsx.ll
@@ -0,0 +1,104 @@
+; RUN: llc -march=bpfel -mcpu=v4 -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; Source:
+;  int f1(char *p) {
+;    return *p;
+;  }
+;  int f2(short *p) {
+;    return *p;
+;  }
+;  int f3(int *p) {
+;    return *p;
+;  }
+;  long f4(char *p) {
+;    return *p;
+;  }
+;  long f5(short *p) {
+;    return *p;
+;  }
+;  long f6(int *p) {
+;    return *p;
+;  }
+;  long f7(long *p) {
+;    return *p;
+;  }
+; Compilation flags:
+;   clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes t.c
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i32 @f1(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i8, ptr %p, align 1, !tbaa !3
+  %conv = sext i8 %0 to i32
+; CHECK:  w0 = *(s8 *)(r1 + 0)  # encoding: [0x91,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+  ret i32 %conv
+}
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i32 @f2(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i16, ptr %p, align 2, !tbaa !6
+  %conv = sext i16 %0 to i32
+; CHECK:  w0 = *(s16 *)(r1 + 0)  # encoding: [0x89,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+  ret i32 %conv
+}
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i32 @f3(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, ptr %p, align 4, !tbaa !8
+; CHECK:  w0 = *(u32 *)(r1 + 0)  # encoding: [0x61,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+  ret i32 %0
+}
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i64 @f4(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i8, ptr %p, align 1, !tbaa !3
+  %conv = sext i8 %0 to i64
+  ret i64 %conv
+; CHECK:      r0 = *(s8 *)(r1 + 0)  # encoding: [0x91,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+}
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i64 @f5(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i16, ptr %p, align 2, !tbaa !6
+  %conv = sext i16 %0 to i64
+  ret i64 %conv
+; CHECK:      r0 = *(s16 *)(r1 + 0)  # encoding: [0x89,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+}
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i64 @f6(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, ptr %p, align 4, !tbaa !8
+  %conv = sext i32 %0 to i64
+  ret i64 %conv
+; CHECK:      r0 = *(s32 *)(r1 + 0)  # encoding: [0x81,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+}
+
+; Function Attrs: argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn
+define dso_local i64 @f7(ptr nocapture noundef readonly %p) local_unnamed_addr #0 {
+entry:
+  %0 = load i64, ptr %p, align 8, !tbaa !10
+  ret i64 %0
+; CHECK:      r0 = *(u64 *)(r1 + 0)  # encoding: [0x79,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+}
+
+attributes #0 = { argmemonly mustprogress nofree norecurse nosync nounwind readonly willreturn "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 68665544c7d59735e9c0bb32b08829c006c7c594)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"short", !4, i64 0}
+!8 = !{!9, !9, i64 0}
+!9 = !{!"int", !4, i64 0}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"long", !4, i64 0}
diff --git a/llvm/test/CodeGen/BPF/movsx.ll b/llvm/test/CodeGen/BPF/movsx.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/movsx.ll
@@ -0,0 +1,79 @@
+; RUN: llc -march=bpfel -mcpu=v4 -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; Source:
+;   short f1(char a) {
+;     return a;
+;   }
+;   int f2(char a) {
+;     return a;
+;   }
+;   long f3(char a) {
+;     return a;
+;   }
+;   int f4(short a) {
+;     return a;
+;   }
+;   long f5(short a) {
+;     return a;
+;   }
+;   long f6(int a) {
+;     return a;
+;   }
+; Compilation flags:
+;   clang -target bpf -O2 -S -emit-llvm t.c
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define dso_local i16 @f1(i8 noundef signext %a) local_unnamed_addr #0 {
+entry:
+  %conv = sext i8 %a to i16
+  ret i16 %conv
+}
+; CHECK: w0 = w1                                 # encoding: [0xbc,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define dso_local i32 @f2(i8 noundef signext %a) local_unnamed_addr #0 {
+entry:
+  %conv = sext i8 %a to i32
+  ret i32 %conv
+}
+; CHECK: w0 = w1                                 # encoding: [0xbc,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define dso_local i64 @f3(i8 noundef signext %a) local_unnamed_addr #0 {
+entry:
+  %conv = sext i8 %a to i64
+  ret i64 %conv
+}
+; CHECK: r0 = (s32)w1                            # encoding: [0xbf,0x10,0x20,0x00,0x00,0x00,0x00,0x00]
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define dso_local i32 @f4(i16 noundef signext %a) local_unnamed_addr #0 {
+entry:
+  %conv = sext i16 %a to i32
+  ret i32 %conv
+}
+; CHECK: w0 = w1                                 # encoding: [0xbc,0x10,0x00,0x00,0x00,0x00,0x00,0x00]
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define dso_local i64 @f5(i16 noundef signext %a) local_unnamed_addr #0 {
+entry:
+  %conv = sext i16 %a to i64
+  ret i64 %conv
+}
+; CHECK: r0 = (s32)w1                            # encoding: [0xbf,0x10,0x20,0x00,0x00,0x00,0x00,0x00]
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
+define dso_local i64 @f6(i32 noundef %a) local_unnamed_addr #0 {
+entry:
+  %conv = sext i32 %a to i64
+  ret i64 %conv
+}
+; CHECK: r0 = (s32)w1                            # encoding: [0xbf,0x10,0x20,0x00,0x00,0x00,0x00,0x00]
+
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git a2913a8a2bfe572d2f1bfea950ab9b0848373648)"}
diff --git a/llvm/test/CodeGen/BPF/sdiv_smod.ll b/llvm/test/CodeGen/BPF/sdiv_smod.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/sdiv_smod.ll
@@ -0,0 +1,77 @@
+; RUN: llc -march=bpfel -mcpu=v4 -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; Source:
+;  int foo(int a, int b, int c) {
+;    return a/b + a%c;
+;  }
+;  long bar(long a, long b, long c) {
+;    return a/b + a%c;
+;  }
+; Compilation flags:
+;   clang -target bpf -O2 -S -emit-llvm -Xclang -disable-llvm-passes t.c
+
+; Function Attrs: nounwind
+define dso_local i32 @foo(i32 noundef %a, i32 noundef %b, i32 noundef %c) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %c.addr = alloca i32, align 4
+  store i32 %a, ptr %a.addr, align 4, !tbaa !3
+  store i32 %b, ptr %b.addr, align 4, !tbaa !3
+  store i32 %c, ptr %c.addr, align 4, !tbaa !3
+  %0 = load i32, ptr %a.addr, align 4, !tbaa !3
+  %1 = load i32, ptr %b.addr, align 4, !tbaa !3
+  %div = sdiv i32 %0, %1
+  %2 = load i32, ptr %a.addr, align 4, !tbaa !3
+  %3 = load i32, ptr %c.addr, align 4, !tbaa !3
+  %rem = srem i32 %2, %3
+  %add = add nsw i32 %div, %rem
+  ret i32 %add
+}
+
+; CHECK:       w0 = w1
+; CHECK-NEXT:  *(u32 *)(r10 - 8) = w2
+; CHECK-NEXT:  *(u32 *)(r10 - 4) = w0
+; CHECK-NEXT:  *(u32 *)(r10 - 12) = w3
+; CHECK-NEXT:  w1 s%= w3  # encoding: [0x9c,0x31,0x01,0x00,0x00,0x00,0x00,0x00]
+; CHECK-NEXT:  w0 s/= w2  # encoding: [0x3c,0x20,0x01,0x00,0x00,0x00,0x00,0x00]
+
+; Function Attrs: nounwind
+define dso_local i64 @bar(i64 noundef %a, i64 noundef %b, i64 noundef %c) #0 {
+entry:
+  %a.addr = alloca i64, align 8
+  %b.addr = alloca i64, align 8
+  %c.addr = alloca i64, align 8
+  store i64 %a, ptr %a.addr, align 8, !tbaa !7
+  store i64 %b, ptr %b.addr, align 8, !tbaa !7
+  store i64 %c, ptr %c.addr, align 8, !tbaa !7
+  %0 = load i64, ptr %a.addr, align 8, !tbaa !7
+  %1 = load i64, ptr %b.addr, align 8, !tbaa !7
+  %div = sdiv i64 %0, %1
+  %2 = load i64, ptr %a.addr, align 8, !tbaa !7
+  %3 = load i64, ptr %c.addr, align 8, !tbaa !7
+  %rem = srem i64 %2, %3
+  %add = add nsw i64 %div, %rem
+  ret i64 %add
+}
+
+; CHECK:       r0 = r1
+; CHECK-NEXT:  *(u64 *)(r10 - 16) = r2
+; CHECK-NEXT:  *(u64 *)(r10 - 8) = r0
+; CHECK-NEXT:  *(u64 *)(r10 - 24) = r3
+; CHECK-NEXT:  r1 s%= r3  # encoding: [0x9f,0x31,0x01,0x00,0x00,0x00,0x00,0x00]
+; CHECK-NEXT:  r0 s/= r2  # encoding: [0x3f,0x20,0x01,0x00,0x00,0x00,0x00,0x00]
+
+attributes #0 = { nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git 569bd3b841e3167ddd7c6ceeddb282d3c280e761)"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"long", !5, i64 0}