diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -26,6 +26,7 @@
   RISCVExpandPseudoInsts.cpp
   RISCVFrameLowering.cpp
   RISCVGatherScatterLowering.cpp
+  RISCVInsertNTLHInsts.cpp
   RISCVInsertVSETVLI.cpp
   RISCVInstrInfo.cpp
   RISCVISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -65,6 +65,9 @@
 FunctionPass *createRISCVExpandAtomicPseudoPass();
 void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
 
+FunctionPass *createRISCVInsertNTLHInstsPass();
+void initializeRISCVInsertNTLHInstsPass(PassRegistry &);
+
 FunctionPass *createRISCVInsertVSETVLIPass();
 void initializeRISCVInsertVSETVLIPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
@@ -0,0 +1,92 @@
+//===-- RISCVInsertNTLHInsts.cpp - Insert NTLH extension instructions -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that inserts non-temporal hint
+// instructions where needed.
+//
+// It checks the MachineMemOperands of each MachineInstr. If an instruction
+// has a MachineMemOperand and isNonTemporal() is true, an NTLH instruction
+// is inserted before it.
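+//
+// For example (a sketch based on the tests in this patch; the exact load or
+// store opcode depends on the memory access), a nontemporal load
+//
+//   ld a0, 0(a0)
+//
+// becomes
+//
+//   ntl.all        (c.ntl.all when the C extension and RVC hints are enabled)
+//   ld a0, 0(a0)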
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define RISCV_INSERT_NTLH_INSTS_NAME "RISCV insert NTLH instruction pass"
+
+namespace {
+
+class RISCVInsertNTLHInsts : public MachineFunctionPass {
+public:
+  const RISCVInstrInfo *TII;
+  static char ID;
+
+  RISCVInsertNTLHInsts() : MachineFunctionPass(ID) {
+    initializeRISCVInsertNTLHInstsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return RISCV_INSERT_NTLH_INSTS_NAME;
+  }
+};
+
+} // end of anonymous namespace
+
+char RISCVInsertNTLHInsts::ID = 0;
+
+bool RISCVInsertNTLHInsts::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
+  const auto &ST = MF.getSubtarget<RISCVSubtarget>();
+
+  if (!ST.hasStdExtZihintntl())
+    return false;
+
+  for (auto &MBB : MF) {
+    for (auto &MBBI : MBB) {
+      if (MBBI.memoperands_empty())
+        continue;
+      MachineMemOperand *MMO = *(MBBI.memoperands_begin());
+      if (MMO->isNonTemporal()) {
+        DebugLoc DL = MBBI.getDebugLoc();
+        if (ST.hasStdExtC() && ST.enableRVCHintInstrs())
+          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoCNTLALL));
+        else
+          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoNTLALL));
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+INITIALIZE_PASS(RISCVInsertNTLHInsts, "riscv-insert-ntlh-insts",
+                RISCV_INSERT_NTLH_INSTS_NAME, false, false)
+
+namespace llvm {
+
+FunctionPass *createRISCVInsertNTLHInstsPass() {
+  return new RISCVInsertNTLHInsts();
+}
+
+} // end of namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1204,6 +1204,20 @@
                               *TM.getMCAsmInfo());
   }
 
+  if (!MI.memoperands_empty()) {
+    MachineMemOperand *MMO = *(MI.memoperands_begin());
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const auto &ST = MF.getSubtarget<RISCVSubtarget>();
+    if (ST.hasStdExtZihintntl() && MMO->isNonTemporal()) {
+      if (ST.hasStdExtC() && ST.enableRVCHintInstrs()) {
+        if (isCompressibleInst(MI, STI))
+          return 4; // c.ntl.all + c.load/c.store
+        return 6;   // c.ntl.all + load/store
+      }
+      return 8; // ntl.all + load/store
+    }
+  }
+
   if (MI.getParent() && MI.getParent()->getParent()) {
     if (isCompressibleInst(MI, STI))
       return 2;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1891,3 +1891,4 @@
 
 include "RISCVInstrInfoXVentana.td"
 include "RISCVInstrInfoXTHead.td"
+include "RISCVInstrInfoZihintntl.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
@@ -0,0 +1,22 @@
+//===-- RISCVInstrInfoZihintntl.td - RISC-V 'Zihintntl' instructions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file describes the RISC-V instructions from the 'Zihintntl'
+/// (Non-Temporal Locality Hints) extension.
+///
+//===----------------------------------------------------------------------===//
+
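+// Note: per the Zihintntl specification, ntl.all is encoded as "add x0, x0, x5"
+// and c.ntl.all as the RVC hint "c.add x0, x5"; the pseudos below expand to
+// those encodings (see the PseudoInstExpansion annotations).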
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 4, isCodeGenOnly = 1 in {
+  def PseudoNTLALL : Pseudo<(outs), (ins), [], "ntl.all">,
+                            PseudoInstExpansion<(ADD X0, X0, X5)>;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 2, isCodeGenOnly = 1 in {
+  def PseudoCNTLALL : Pseudo<(outs), (ins), [], "c.ntl.all">,
+                             PseudoInstExpansion<(C_ADD_HINT X0, X0, X5)>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -80,6 +80,7 @@
   initializeRISCVStripWSuffixPass(*PR);
   initializeRISCVPreRAExpandPseudoPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
+  initializeRISCVInsertNTLHInstsPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
   initializeRISCVDAGToDAGISelPass(*PR);
 }
@@ -332,6 +333,7 @@
 
 void RISCVPassConfig::addPreEmitPass2() {
   addPass(createRISCVExpandPseudoPass());
+  addPass(createRISCVInsertNTLHInstsPass());
   // Schedule the expansion of AMOs at the last possible moment, avoiding the
   // possibility for other passes to break the requirements for forward
   // progress in the LR/SC block.
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -63,6 +63,7 @@
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISCV pseudo instruction expansion pass
+; CHECK-NEXT:       RISCV insert NTLH instruction pass
 ; CHECK-NEXT:       RISCV atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -174,6 +174,7 @@
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISCV pseudo instruction expansion pass
+; CHECK-NEXT:       RISCV insert NTLH instruction pass
 ; CHECK-NEXT:       RISCV atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/RISCV/nontemporal-c.ll b/llvm/test/CodeGen/RISCV/nontemporal-c.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal-c.ll
@@ -0,0 +1,468 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+c < %s | FileCheck %s -check-prefix=CHECK-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+c < %s | FileCheck %s -check-prefix=CHECK-RV32
+
+define i64 @test_nontemporal_load_i64(i64* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i64, i64* %p, align 8, !nontemporal !0
+  ret i64 %1
+}
+
+define i32 @test_nontemporal_load_i32(i32* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i32, i32* %p, align 8, !nontemporal !0
+  ret i32 %1
+}
+
+define i16 @test_nontemporal_load_i16(i16* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i16, i16* %p, align 8, !nontemporal !0
+  ret i16 %1
+}
+
+define i8 @test_nontemporal_load_i8(i8* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lb a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lb a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i8, i8* %p, align 8, !nontemporal !0
+  ret i8 %1
+}
+
+define half @test_nontemporal_half(half *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    flh ft0, 0(a0)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    flh ft1, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, ft0, ft1
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    flh ft0, 0(a0)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    flh ft1, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, ft0, ft1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load half, half* %a, !nontemporal !0
+  %2 = getelementptr half, half* %a, i32 3
+  %3 = load half, half* %2, !nontemporal !0
+; Use both loaded values in an FP op to ensure an flh is used, even for the
+; soft half ABI
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+define float @test_nontemporal_load_float(float* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load float, float* %p, align 8, !nontemporal !0
+  ret float %1
+}
+
+define double @test_nontemporal_load_double(double* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load double, double* %p, align 8, !nontemporal !0
+  ret double %1
+}
+
+define dso_local i64 @test_nontemporal_load_unsigned_i8(i8 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lbu a1, 4(a0)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a0, a1
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lbu a1, 4(a0)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    sltu a1, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i8, i8* %a, i32 4
+  %2 = load i8, i8* %1, !nontemporal !0
+  %3 = zext i8 %2 to i64
+  %4 = load volatile i8, i8* %a, !nontemporal !0
+  %5 = zext i8 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define dso_local i32 @test_nontemporal_load_unsigned_i16(i16 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lhu a1, 10(a0)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lhu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a0, a1
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lhu a1, 10(a0)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lhu a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i16, i16* %a, i32 5
+  %2 = load i16, i16* %1, !nontemporal !0
+  %3 = zext i16 %2 to i32
+  %4 = load volatile i16, i16* %a, !nontemporal !0
+  %5 = zext i16 %4 to i32
+  %6 = add i32 %3, %5
+  ret i32 %6
+}
+
+define dso_local i64 @test_nontemporal_load_unsigned_i32(i32 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lwu a1, 24(a0)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    lwu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a0, a1
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a1, 24(a0)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a0, a1
+; CHECK-RV32-NEXT:    sltu a1, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i32, i32* %a, i32 6
+  %2 = load i32, i32* %1, !nontemporal !0
+  %3 = zext i32 %2 to i64
+  %4 = load volatile i32, i32* %a, !nontemporal !0
+  %5 = zext i32 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define void @test_nontemporal_store_i64(i64* %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i64 %v, i64* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i32(i32* %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i32 %v, i32* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i16(i16* %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i16 %v, i16* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i8(i8* %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i8 %v, i8* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_half(half* %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store half %v, half* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_float(float* %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store float %v, float* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_double(double* %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store double %v, double* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define <16 x i8> @test_nontemporal_load_v16i8(<16 x i8>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <16 x i8>, <16 x i8>* %p, align 16, !nontemporal !0
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_load_v8i16(<8 x i16>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <8 x i16>, <8 x i16>* %p, align 16, !nontemporal !0
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_load_v4i32(<4 x i32>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <4 x i32>, <4 x i32>* %p, align 16, !nontemporal !0
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_load_v2i64(<2 x i64>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    c.ntl.all
+; CHECK-RV64-NEXT:    ld a1, 8(a0)
+; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    c.ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <2 x i64>, <2 x i64>* %p, align 16, !nontemporal !0
+  ret <2 x i64> %1
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/RISCV/nontemporal-vector.ll b/llvm/test/CodeGen/RISCV/nontemporal-vector.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal-vector.ll
@@ -0,0 +1,422 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV32
+
+define i64 @test_nontemporal_load_i64(i64* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i64, i64* %p, align 8, !nontemporal !0
+  ret i64 %1
+}
+
+define i32 @test_nontemporal_load_i32(i32* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i32, i32* %p, align 8, !nontemporal !0
+  ret i32 %1
+}
+
+define i16 @test_nontemporal_load_i16(i16* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i16, i16* %p, align 8, !nontemporal !0
+  ret i16 %1
+}
+
+define i8 @test_nontemporal_load_i8(i8* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lb a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lb a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i8, i8* %p, align 8, !nontemporal !0
+  ret i8 %1
+}
+
+define half @test_nontemporal_half(half *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flh ft0, 0(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flh ft1, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, ft0, ft1
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flh ft0, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flh ft1, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, ft0, ft1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load half, half* %a, !nontemporal !0
+  %2 = getelementptr half, half* %a, i32 3
+  %3 = load half, half* %2, !nontemporal !0
+; Use both loaded values in an FP op to ensure an flh is used, even for the
+; soft half ABI
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+define float @test_nontemporal_load_float(float* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load float, float* %p, align 8, !nontemporal !0
+  ret float %1
+}
+
+define double @test_nontemporal_load_double(double* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load double, double* %p, align 8, !nontemporal !0
+  ret double %1
+}
+
+define dso_local i64 @test_nontemporal_load_unsigned_i8(i8 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lbu a1, 4(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a1, a0
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lbu a1, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    sltu a1, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i8, i8* %a, i32 4
+  %2 = load i8, i8* %1, !nontemporal !0
+  %3 = zext i8 %2 to i64
+  %4 = load volatile i8, i8* %a, !nontemporal !0
+  %5 = zext i8 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define dso_local i32 @test_nontemporal_load_unsigned_i16(i16 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lhu a1, 10(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lhu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a1, a0
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lhu a1, 10(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lhu a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i16, i16* %a, i32 5
+  %2 = load i16, i16* %1, !nontemporal !0
+  %3 = zext i16 %2 to i32
+  %4 = load volatile i16, i16* %a, !nontemporal !0
+  %5 = zext i16 %4 to i32
+  %6 = add i32 %3, %5
+  ret i32 %6
+}
+
+define dso_local i64 @test_nontemporal_load_unsigned_i32(i32 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lwu a1, 24(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lwu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a1, a0
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 24(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    sltu a1, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i32, i32* %a, i32 6
+  %2 = load i32, i32* %1, !nontemporal !0
+  %3 = zext i32 %2 to i64
+  %4 = load volatile i32, i32* %a, !nontemporal !0
+  %5 = zext i32 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define void @test_nontemporal_store_i64(i64* %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i64 %v, i64* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i32(i32* %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i32 %v, i32* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i16(i16* %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i16 %v, i16* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i8(i8* %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i8 %v, i8* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_half(half* %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store half %v, half* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_float(float* %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store float %v, float* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_double(double* %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store double %v, double* %p, align 8, !nontemporal !0
+  ret void
+}
+
+
+define <16 x i8> @test_nontemporal_load_v16i8(<16 x i8>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    vle8.v v8, (a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    vle8.v v8, (a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <16 x i8>, <16 x i8>* %p, align 16, !nontemporal !0
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_load_v8i16(<8 x i16>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    vle16.v v8, (a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    vle16.v v8, (a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <8 x i16>, <8 x i16>* %p, align 16, !nontemporal !0
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_load_v4i32(<4 x i32>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    vle32.v v8, (a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    vle32.v v8, (a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <4 x i32>, <4 x i32>* %p, align 16, !nontemporal !0
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_load_v2i64(<2 x i64>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    vle64.v v8, (a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    vle64.v v8, (a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <2 x i64>, <2 x i64>* %p, align 16, !nontemporal !0
+  ret <2 x i64> %1
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -0,0 +1,469 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh < %s | FileCheck %s -check-prefix=CHECK-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh < %s | FileCheck %s -check-prefix=CHECK-RV32
+
+define i64 @test_nontemporal_load_i64(i64* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i64, i64* %p, align 8, !nontemporal !0
+  ret i64 %1
+}
+
+define i32 @test_nontemporal_load_i32(i32* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lw a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i32, i32* %p, align 8, !nontemporal !0
+  ret i32 %1
+}
+
+define i16 @test_nontemporal_load_i16(i16* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lh a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lh a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i16, i16* %p, align 8, !nontemporal !0
+  ret i16 %1
+}
+
+define i8 @test_nontemporal_load_i8(i8* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lb a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lb a0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load i8, i8* %p, align 8, !nontemporal !0
+  ret i8 %1
+}
+
+define half @test_nontemporal_half(half *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flh ft0, 0(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flh ft1, 6(a0)
+; CHECK-RV64-NEXT:    fadd.h fa0, ft0, ft1
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flh ft0, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flh ft1, 6(a0)
+; CHECK-RV32-NEXT:    fadd.h fa0, ft0, ft1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load half, half* %a, !nontemporal !0
+  %2 = getelementptr half, half* %a, i32 3
+  %3 = load half, half* %2, !nontemporal !0
+; Use both loaded values in an FP op to ensure an flh is used, even for the
+; soft half ABI
+  %4 = fadd half %1, %3
+  ret half %4
+}
+
+define float @test_nontemporal_load_float(float* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    flw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    flw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load float, float* %p, align 8, !nontemporal !0
+  ret float %1
+}
+
+define double @test_nontemporal_load_double(double* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fld fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fld fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load double, double* %p, align 8, !nontemporal !0
+  ret double %1
+}
+
+define dso_local i64 @test_nontemporal_load_unsigned_i8(i8 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lbu a1, 4(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lbu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a1, a0
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lbu a1, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lbu a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    sltu a1, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i8, i8* %a, i32 4
+  %2 = load i8, i8* %1, !nontemporal !0
+  %3 = zext i8 %2 to i64
+  %4 = load volatile i8, i8* %a, !nontemporal !0
+  %5 = zext i8 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define dso_local i32 @test_nontemporal_load_unsigned_i16(i16 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lhu a1, 10(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lhu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a1, a0
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lhu a1, 10(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lhu a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i16, i16* %a, i32 5
+  %2 = load i16, i16* %1, !nontemporal !0
+  %3 = zext i16 %2 to i32
+  %4 = load volatile i16, i16* %a, !nontemporal !0
+  %5 = zext i16 %4 to i32
+  %6 = add i32 %3, %5
+  ret i32 %6
+}
+
+define dso_local i64 @test_nontemporal_load_unsigned_i32(i32 *%a) nounwind {
+; CHECK-RV64-LABEL: test_nontemporal_load_unsigned_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lwu a1, 24(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    lwu a0, 0(a0)
+; CHECK-RV64-NEXT:    add a0, a1, a0
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_unsigned_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 24(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a0, 0(a0)
+; CHECK-RV32-NEXT:    add a0, a1, a0
+; CHECK-RV32-NEXT:    sltu a1, a0, a1
+; CHECK-RV32-NEXT:    ret
+
+  %1 = getelementptr i32, i32* %a, i32 6
+  %2 = load i32, i32* %1, !nontemporal !0
+  %3 = zext i32 %2 to i64
+  %4 = load volatile i32, i32* %a, !nontemporal !0
+  %5 = zext i32 %4 to i64
+  %6 = add i64 %3, %5
+  ret i64 %6
+}
+
+define void @test_nontemporal_store_i64(i64* %p, i64 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a2, 4(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i64 %v, i64* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i32(i32* %p, i32 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sw a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i32 %v, i32* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i16(i16* %p, i16 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sh a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sh a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i16 %v, i16* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_i8(i8* %p, i8 %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    sb a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    sb a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store i8 %v, i8* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_half(half* %p, half %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_half:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_half:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsh fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store half %v, half* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_float(float* %p, float %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_float:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_float:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsw fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store float %v, float* %p, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_double(double* %p, double %v) {
+; CHECK-RV64-LABEL: test_nontemporal_store_double:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_store_double:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    fsd fa0, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  store double %v, double* %p, align 8, !nontemporal !0
+  ret void
+}
+
+
+define <16 x i8> @test_nontemporal_load_v16i8(<16 x i8>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v16i8:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <16 x i8>, <16 x i8>* %p, align 16, !nontemporal !0
+  ret <16 x i8> %1
+}
+
+define <8 x i16> @test_nontemporal_load_v8i16(<8 x i16>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v8i16:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <8 x i16>, <8 x i16>* %p, align 16, !nontemporal !0
+  ret <8 x i16> %1
+}
+
+define <4 x i32> @test_nontemporal_load_v4i32(<4 x i32>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 8(a1)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 0(a1)
+; CHECK-RV64-NEXT:    sd a2, 8(a0)
+; CHECK-RV64-NEXT:    sd a1, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v4i32:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <4 x i32>, <4 x i32>* %p, align 16, !nontemporal !0
+  ret <4 x i32> %1
+}
+
+define <2 x i64> @test_nontemporal_load_v2i64(<2 x i64>* %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a1, 8(a0)
+; CHECK-RV64-NEXT:    mv a0, a2
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_v2i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 12(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a3, 8(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a4, 4(a1)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 0(a1)
+; CHECK-RV32-NEXT:    sw a2, 12(a0)
+; CHECK-RV32-NEXT:    sw a3, 8(a0)
+; CHECK-RV32-NEXT:    sw a4, 4(a0)
+; CHECK-RV32-NEXT:    sw a1, 0(a0)
+; CHECK-RV32-NEXT:    ret
+
+  %1 = load <2 x i64>, <2 x i64>* %p, align 16, !nontemporal !0
+  ret <2 x i64> %1
+}
+
+!0 = !{i32 1}