diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -26,6 +26,7 @@
   RISCVExpandPseudoInsts.cpp
   RISCVFrameLowering.cpp
   RISCVGatherScatterLowering.cpp
+  RISCVInsertNTLHInsts.cpp
   RISCVInsertVSETVLI.cpp
   RISCVInstrInfo.cpp
   RISCVISelDAGToDAG.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -62,6 +62,9 @@
 FunctionPass *createRISCVExpandAtomicPseudoPass();
 void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
 
+FunctionPass *createRISCVInsertNTLHInstsPass();
+void initializeRISCVInsertNTLHInstsPass(PassRegistry &);
+
 FunctionPass *createRISCVInsertVSETVLIPass();
 void initializeRISCVInsertVSETVLIPass(PassRegistry &);
 
diff --git a/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInsertNTLHInsts.cpp
@@ -0,0 +1,101 @@
+//===-- RISCVInsertNTLHInsts.cpp - Insert NTLH extension instruction ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a function pass that inserts non-temporal hint
+// instructions where needed.
+//
+// It checks the MachineMemOperand of all MachineInstr.
+// If the instruction has a MachineMemOperand and isNonTemporal is true,
+// then an NTLH instruction is inserted before it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define RISCV_INSERT_NTLH_INSTS_NAME "RISC-V insert NTLH instruction pass"
+
+namespace {
+
+/// Machine function pass that places a non-temporal locality hint pseudo
+/// in front of every instruction whose first memory operand is non-temporal.
+class RISCVInsertNTLHInsts : public MachineFunctionPass {
+public:
+  const RISCVInstrInfo *TII;
+  static char ID;
+
+  RISCVInsertNTLHInsts() : MachineFunctionPass(ID) {
+    initializeRISCVInsertNTLHInstsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  // Only new instructions are inserted into existing blocks, so the CFG is
+  // reported as preserved.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return RISCV_INSERT_NTLH_INSTS_NAME;
+  }
+};
+
+} // end of anonymous namespace
+
+char RISCVInsertNTLHInsts::ID = 0;
+
+/// Returns true if at least one hint pseudo was inserted into \p MF.
+bool RISCVInsertNTLHInsts::runOnMachineFunction(MachineFunction &MF) {
+  // The RISC-V specific subtarget is required: the extension queries below are
+  // not part of the generic TargetSubtargetInfo interface.
+  const auto &ST = MF.getSubtarget<RISCVSubtarget>();
+  TII = ST.getInstrInfo();
+
+  if (!ST.hasStdExtZihintntl())
+    return false;
+
+  bool Changed = false;
+  for (auto &MBB : MF) {
+    for (auto &MBBI : MBB) {
+      if (MBBI.memoperands_empty())
+        continue;
+      // Only the first memory operand is consulted.
+      MachineMemOperand *MMO = *(MBBI.memoperands_begin());
+      if (MMO->isNonTemporal()) {
+        DebugLoc DL = MBBI.getDebugLoc();
+        // Use the compressed hint (c.ntl.all) when RVC hints are enabled.
+        if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs())
+          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoCNTLALL));
+        else
+          BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoNTLALL));
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+INITIALIZE_PASS(RISCVInsertNTLHInsts, "riscv-insert-ntlh-insts",
+                RISCV_INSERT_NTLH_INSTS_NAME, false, false)
+
+namespace llvm {
+
+FunctionPass *createRISCVInsertNTLHInstsPass() {
+  return new RISCVInsertNTLHInsts();
+}
+
+} // end of namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp 
b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1243,6 +1243,23 @@
                               *TM.getMCAsmInfo());
   }
 
+  // A non-temporal access is preceded by an ntl.all / c.ntl.all hint (inserted
+  // late by RISCVInsertNTLHInsts), so account for the hint's bytes here.
+  if (!MI.memoperands_empty() && MI.getParent() &&
+      MI.getParent()->getParent()) {
+    MachineMemOperand *MMO = *(MI.memoperands_begin());
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const auto &ST = MF.getSubtarget<RISCVSubtarget>();
+    if (ST.hasStdExtZihintntl() && MMO->isNonTemporal()) {
+      if (ST.hasStdExtCOrZca() && ST.enableRVCHintInstrs()) {
+        if (isCompressibleInst(MI, STI))
+          return 4; // c.ntl.all + c.load/c.store
+        return 6; // c.ntl.all + load/store
+      }
+      return 8; // ntl.all + load/store
+    }
+  }
+
   if (MI.getParent() && MI.getParent()->getParent()) {
     if (isCompressibleInst(MI, STI))
       return 2;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1920,3 +1920,4 @@
 
 include "RISCVInstrInfoXVentana.td"
 include "RISCVInstrInfoXTHead.td"
+include "RISCVInstrInfoZihintntl.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZihintntl.td
@@ -0,0 +1,24 @@
+//===RISCVInstrInfoZihintntl.td - 'Zihintntl' instructions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file describes the RISC-V instructions from the Non-Temporal Locality
+/// Hints extension (Zihintntl).
+///
+//===----------------------------------------------------------------------===//
+
+// ntl.all: 4-byte hint pseudo, expanded to ADD X0, X0, X5.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 4 in {
+  def PseudoNTLALL : Pseudo<(outs), (ins), [], "ntl.all">,
+                     PseudoInstExpansion<(ADD X0, X0, X5)>;
+}
+
+// c.ntl.all: 2-byte hint pseudo, expanded to C_ADD_HINT X0, X0, X5.
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 2 in {
+  def PseudoCNTLALL : Pseudo<(outs), (ins), [], "c.ntl.all">,
+                      PseudoInstExpansion<(C_ADD_HINT X0, X0, X5)>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -83,6 +83,7 @@
   initializeRISCVOptWInstrsPass(*PR);
   initializeRISCVPreRAExpandPseudoPass(*PR);
   initializeRISCVExpandPseudoPass(*PR);
+  initializeRISCVInsertNTLHInstsPass(*PR);
   initializeRISCVInsertVSETVLIPass(*PR);
   initializeRISCVDAGToDAGISelPass(*PR);
   initializeRISCVInitUndefPass(*PR);
@@ -348,6 +349,7 @@
 
 void RISCVPassConfig::addPreEmitPass2() {
   addPass(createRISCVExpandPseudoPass());
+  addPass(createRISCVInsertNTLHInstsPass());
   // Schedule the expansion of AMOs at the last possible moment, avoiding the
   // possibility for other passes to break the requirements for forward
diff --git a/llvm/test/CodeGen/RISCV/O0-pipeline.ll b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O0-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O0-pipeline.ll
@@ -64,6 +64,7 @@
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
+; CHECK-NEXT:       RISC-V insert NTLH instruction pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -177,6 +177,7 @@
 ; 
CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
+; CHECK-NEXT:       RISC-V insert NTLH instruction pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
diff --git a/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal-scalable.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV64V
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV32V
+
+define <vscale x 2 x i64> @test_nontemporal_load_nxv2i64(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2re64.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 2 x i64>, ptr %p, !nontemporal !0
+  ret <vscale x 2 x i64> %1
+}
+
+define <vscale x 4 x i32> @test_nontemporal_load_nxv4i32(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2re32.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 4 x i32>, ptr %p, !nontemporal !0
+  ret <vscale x 4 x i32> %1
+}
+
+define <vscale x 8 x i16> @test_nontemporal_load_nxv8i16(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2re16.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 8 x i16>, ptr %p, !nontemporal !0
+  ret <vscale x 8 x i16> %1
+}
+
+define <vscale x 16 x i8> @test_nontemporal_load_nxv16i8(ptr %p) {
+; CHECK-RV64V-LABEL: test_nontemporal_load_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_load_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vl2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  %1 = load <vscale x 16 x i8>, ptr %p, !nontemporal !0
+  ret <vscale x 16 x i8> %1
+}
+
+define void @test_nontemporal_store_nxv2i64(ptr %p, <vscale x 2 x i64> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv2i64:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv2i64:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 2 x i64> %v, ptr %p, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_nxv4i32(ptr %p, <vscale x 4 x i32> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv4i32:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv4i32:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 4 x i32> %v, ptr %p, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_nxv8i16(ptr %p, <vscale x 8 x i16> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv8i16:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv8i16:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 8 x i16> %v, ptr %p, !nontemporal !0
+  ret void
+}
+
+define void @test_nontemporal_store_nxv16i8(ptr %p, <vscale x 16 x i8> %v) {
+; CHECK-RV64V-LABEL: test_nontemporal_store_nxv16i8:
+; CHECK-RV64V:       # %bb.0:
+; CHECK-RV64V-NEXT:    ntl.all
+; CHECK-RV64V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV64V-NEXT:    ret
+;
+; CHECK-RV32V-LABEL: test_nontemporal_store_nxv16i8:
+; CHECK-RV32V:       # %bb.0:
+; CHECK-RV32V-NEXT:    ntl.all
+; CHECK-RV32V-NEXT:    vs2r.v v8, (a0)
+; CHECK-RV32V-NEXT:    ret
+  store <vscale x 16 x i8> %v, ptr %p, !nontemporal !0
+  ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -0,0 +1,1441 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh < %s | FileCheck %s -check-prefix=CHECK-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh < %s | FileCheck %s -check-prefix=CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+c < %s | FileCheck %s -check-prefix=CHECK-RV64C
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+c < %s | FileCheck %s -check-prefix=CHECK-RV32C
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV64V
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-zihintntl,+f,+d,+zfh,+v < %s | FileCheck %s -check-prefix=CHECK-RV32V
+
+define i64 @test_nontemporal_load_i64(ptr %p) {
+; CHECK-RV64-LABEL: test_nontemporal_load_i64:
+; CHECK-RV64:       # %bb.0:
+; CHECK-RV64-NEXT:    ntl.all
+; CHECK-RV64-NEXT:    ld a0, 0(a0)
+; CHECK-RV64-NEXT:    ret
+;
+; CHECK-RV32-LABEL: test_nontemporal_load_i64:
+; CHECK-RV32:       # %bb.0:
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-RV32-NEXT:    ntl.all
+; CHECK-RV32-NEXT:    lw a1, 4(a0)
+; CHECK-RV32-NEXT:    mv a0, a2
+; CHECK-RV32-NEXT:    ret
+;
+; 
CHECK-RV64C-LABEL: test_nontemporal_load_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 4(a0) +; CHECK-RV32C-NEXT: mv a0, a2 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: ld a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lw a2, 0(a0) +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lw a1, 4(a0) +; CHECK-RV32V-NEXT: mv a0, a2 +; CHECK-RV32V-NEXT: ret + + %1 = load i64, ptr %p, !nontemporal !0 + ret i64 %1 +} + +define i32 @test_nontemporal_load_i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: lw a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: lw a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: lw a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lw a0, 0(a0) +; CHECK-RV32V-NEXT: ret + + %1 = load i32, ptr %p, !nontemporal !0 + ret i32 %1 +} + 
+define i16 @test_nontemporal_load_i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: lh a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lh a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: lh a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lh a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: lh a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lh a0, 0(a0) +; CHECK-RV32V-NEXT: ret + + %1 = load i16, ptr %p, !nontemporal !0 + ret i16 %1 +} + +define i8 @test_nontemporal_load_i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: lbu a0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lbu a0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: lbu a0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lbu a0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: lbu a0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: 
test_nontemporal_load_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: lbu a0, 0(a0) +; CHECK-RV32V-NEXT: ret + + %1 = load i8, ptr %p, !nontemporal !0 + ret i8 %1 +} + +define half @test_nontemporal_load_half(ptr %p) nounwind { +; CHECK-RV64-LABEL: test_nontemporal_load_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: flh fa5, 0(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: flh fa4, 6(a0) +; CHECK-RV64-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: flh fa5, 0(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: flh fa4, 6(a0) +; CHECK-RV32-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: flh fa5, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: flh fa4, 6(a0) +; CHECK-RV64C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: flh fa5, 0(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: flh fa4, 6(a0) +; CHECK-RV32C-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: flh fa5, 0(a0) +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: flh fa4, 6(a0) +; CHECK-RV64V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: flh fa5, 0(a0) +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: flh fa4, 6(a0) +; CHECK-RV32V-NEXT: fadd.h fa0, fa5, fa4 +; CHECK-RV32V-NEXT: ret + + %1 = load half, ptr %p, !nontemporal !0 + %2 = getelementptr half, ptr %p, i32 3 + %3 = load 
half, ptr %2, !nontemporal !0 + %4 = fadd half %1, %3 + ret half %4 +} + +define float @test_nontemporal_load_float(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: flw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: flw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: flw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: flw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: flw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: flw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + + %1 = load float, ptr %p, !nontemporal !0 + ret float %1 +} + +define double @test_nontemporal_load_double(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fld fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fld fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fld fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fld fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_double: +; CHECK-RV64V: # 
%bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fld fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fld fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + + %1 = load double, ptr %p, !nontemporal !0 + ret double %1 +} + +define <16 x i8> @test_nontemporal_load_v16i8(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: 
test_nontemporal_load_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + + %1 = load <16 x i8>, ptr %p, !nontemporal !0 + ret <16 x i8> %1 +} + +define <8 x i16> @test_nontemporal_load_v8i16(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; 
CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + + %1 = load <8 x i16>, ptr %p, !nontemporal !0 + ret <8 x i16> %1 +} + +define <4 x i32> @test_nontemporal_load_v4i32(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 8(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 0(a1) +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 8(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 0(a1) +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; 
CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + + %1 = load <4 x i32>, ptr %p, !nontemporal !0 + ret <4 x i32> %1 +} + +define <2 x i64> @test_nontemporal_load_v2i64(ptr %p) { +; CHECK-RV64-LABEL: test_nontemporal_load_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a2, 0(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: ld a1, 8(a0) +; CHECK-RV64-NEXT: mv a0, a2 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_load_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_load_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a2, 0(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: ld a1, 8(a0) +; CHECK-RV64C-NEXT: mv a0, a2 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_load_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw 
a2, 12(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_load_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vle64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_load_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vle64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + + %1 = load <2 x i64>, ptr %p, !nontemporal !0 + ret <2 x i64> %1 +} + +define void @test_nontemporal_store_i64(ptr %p, i64 %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a2, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sd a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: 
ntl.all +; CHECK-RV32V-NEXT: sw a2, 4(a0) +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + + store i64 %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_i32(ptr %p, i32 %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sw a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sw a1, 0(a0) +; CHECK-RV32V-NEXT: ret + + store i32 %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_i16(ptr %p, i16 %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a1, 0(a0) +; 
CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sh a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sh a1, 0(a0) +; CHECK-RV32V-NEXT: ret + + store i16 %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_i8(ptr %p, i8 %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: sb a1, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: sb a1, 0(a0) +; CHECK-RV32V-NEXT: ret + + store i8 %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_half(ptr %p, half %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_half: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fsh fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_half: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fsh fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_half: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: 
c.ntl.all +; CHECK-RV64C-NEXT: fsh fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_half: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fsh fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_half: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fsh fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_half: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fsh fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + + store half %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_float(ptr %p, float %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_float: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fsw fa0, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_float: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fsw fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_float: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fsw fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_float: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fsw fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_float: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fsw fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_float: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fsw fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + + store float %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_double(ptr %p, double %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_double: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: fsd fa0, 0(a0) +; CHECK-RV64-NEXT: 
ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_double: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: fsd fa0, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_double: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: fsd fa0, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_double: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: fsd fa0, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_double: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: fsd fa0, 0(a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_double: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: fsd fa0, 0(a0) +; CHECK-RV32V-NEXT: ret + + store double %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_v16i8(ptr %p, <16 x i8> %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_v16i8: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: addi sp, sp, -16 +; CHECK-RV64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64-NEXT: .cfi_offset s0, -8 +; CHECK-RV64-NEXT: .cfi_offset s1, -16 +; CHECK-RV64-NEXT: lbu a2, 0(a1) +; CHECK-RV64-NEXT: lbu a3, 8(a1) +; CHECK-RV64-NEXT: lbu a4, 16(a1) +; CHECK-RV64-NEXT: lbu a5, 24(a1) +; CHECK-RV64-NEXT: lbu a6, 32(a1) +; CHECK-RV64-NEXT: lbu a7, 40(a1) +; CHECK-RV64-NEXT: lbu t0, 48(a1) +; CHECK-RV64-NEXT: lbu t1, 56(a1) +; CHECK-RV64-NEXT: lbu t2, 64(a1) +; CHECK-RV64-NEXT: lbu t3, 72(a1) +; CHECK-RV64-NEXT: lbu t4, 80(a1) +; CHECK-RV64-NEXT: lbu t5, 88(a1) +; CHECK-RV64-NEXT: lbu t6, 120(a1) +; CHECK-RV64-NEXT: lbu s0, 112(a1) +; CHECK-RV64-NEXT: lbu s1, 104(a1) +; CHECK-RV64-NEXT: lbu a1, 96(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t6, 15(a0) +; CHECK-RV64-NEXT: ntl.all +; 
CHECK-RV64-NEXT: sb s0, 14(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb s1, 13(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a1, 12(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t5, 11(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t4, 10(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t3, 9(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t2, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t1, 7(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb t0, 6(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a7, 5(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a6, 4(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a5, 3(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a4, 2(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a3, 1(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sb a2, 0(a0) +; CHECK-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64-NEXT: addi sp, sp, 16 +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_v16i8: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: addi sp, sp, -16 +; CHECK-RV32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32-NEXT: .cfi_offset s0, -4 +; CHECK-RV32-NEXT: .cfi_offset s1, -8 +; CHECK-RV32-NEXT: lbu a2, 0(a1) +; CHECK-RV32-NEXT: lbu a3, 4(a1) +; CHECK-RV32-NEXT: lbu a4, 8(a1) +; CHECK-RV32-NEXT: lbu a5, 12(a1) +; CHECK-RV32-NEXT: lbu a6, 16(a1) +; CHECK-RV32-NEXT: lbu a7, 20(a1) +; CHECK-RV32-NEXT: lbu t0, 24(a1) +; CHECK-RV32-NEXT: lbu t1, 28(a1) +; CHECK-RV32-NEXT: lbu t2, 32(a1) +; CHECK-RV32-NEXT: lbu t3, 36(a1) +; CHECK-RV32-NEXT: lbu t4, 40(a1) +; CHECK-RV32-NEXT: lbu t5, 44(a1) +; CHECK-RV32-NEXT: lbu t6, 60(a1) +; CHECK-RV32-NEXT: lbu s0, 56(a1) +; CHECK-RV32-NEXT: lbu s1, 52(a1) +; CHECK-RV32-NEXT: lbu a1, 48(a1) +; CHECK-RV32-NEXT: ntl.all +; 
CHECK-RV32-NEXT: sb t6, 15(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb s0, 14(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb s1, 13(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a1, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t5, 11(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t4, 10(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t3, 9(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t2, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t1, 7(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb t0, 6(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a7, 5(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a6, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a5, 3(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a4, 2(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a3, 1(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sb a2, 0(a0) +; CHECK-RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32-NEXT: addi sp, sp, 16 +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_v16i8: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: addi sp, sp, -16 +; CHECK-RV64C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV64C-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; CHECK-RV64C-NEXT: .cfi_offset s0, -8 +; CHECK-RV64C-NEXT: .cfi_offset s1, -16 +; CHECK-RV64C-NEXT: lbu a6, 0(a1) +; CHECK-RV64C-NEXT: lbu a7, 8(a1) +; CHECK-RV64C-NEXT: lbu t0, 16(a1) +; CHECK-RV64C-NEXT: lbu t1, 24(a1) +; CHECK-RV64C-NEXT: lbu t2, 32(a1) +; CHECK-RV64C-NEXT: lbu t3, 40(a1) +; CHECK-RV64C-NEXT: lbu t4, 48(a1) +; CHECK-RV64C-NEXT: lbu t5, 56(a1) +; CHECK-RV64C-NEXT: lbu t6, 64(a1) +; CHECK-RV64C-NEXT: lbu a3, 72(a1) +; CHECK-RV64C-NEXT: lbu a4, 80(a1) +; CHECK-RV64C-NEXT: lbu a5, 88(a1) +; CHECK-RV64C-NEXT: lbu a2, 120(a1) +; CHECK-RV64C-NEXT: lbu s0, 112(a1) +; 
CHECK-RV64C-NEXT: lbu s1, 104(a1) +; CHECK-RV64C-NEXT: lbu a1, 96(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a2, 15(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb s0, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb s1, 13(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a1, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a5, 11(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a3, 9(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t6, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t5, 7(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t4, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t3, 5(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t2, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t1, 3(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb t0, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a7, 1(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sb a6, 0(a0) +; CHECK-RV64C-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; CHECK-RV64C-NEXT: addi sp, sp, 16 +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_v16i8: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: addi sp, sp, -16 +; CHECK-RV32C-NEXT: .cfi_def_cfa_offset 16 +; CHECK-RV32C-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; CHECK-RV32C-NEXT: .cfi_offset s0, -4 +; CHECK-RV32C-NEXT: .cfi_offset s1, -8 +; CHECK-RV32C-NEXT: lbu a6, 0(a1) +; CHECK-RV32C-NEXT: lbu a7, 4(a1) +; CHECK-RV32C-NEXT: lbu t0, 8(a1) +; CHECK-RV32C-NEXT: lbu t1, 12(a1) +; CHECK-RV32C-NEXT: lbu t2, 16(a1) +; CHECK-RV32C-NEXT: lbu t3, 20(a1) +; CHECK-RV32C-NEXT: lbu t4, 24(a1) +; CHECK-RV32C-NEXT: lbu t5, 28(a1) +; CHECK-RV32C-NEXT: lbu t6, 32(a1) +; CHECK-RV32C-NEXT: lbu a3, 
36(a1) +; CHECK-RV32C-NEXT: lbu a4, 40(a1) +; CHECK-RV32C-NEXT: lbu a5, 44(a1) +; CHECK-RV32C-NEXT: lbu a2, 60(a1) +; CHECK-RV32C-NEXT: lbu s0, 56(a1) +; CHECK-RV32C-NEXT: lbu s1, 52(a1) +; CHECK-RV32C-NEXT: lbu a1, 48(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a2, 15(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb s0, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb s1, 13(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a1, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a5, 11(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a3, 9(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t6, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t5, 7(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t4, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t3, 5(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t2, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t1, 3(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb t0, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a7, 1(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sb a6, 0(a0) +; CHECK-RV32C-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; CHECK-RV32C-NEXT: addi sp, sp, 16 +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_v16i8: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse8.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_v16i8: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse8.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <16 x i8> %v, ptr %p, !nontemporal !0 + ret void +} + +define void 
@test_nontemporal_store_v8i16(ptr %p, <8 x i16> %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_v8i16: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lh a2, 0(a1) +; CHECK-RV64-NEXT: lh a3, 8(a1) +; CHECK-RV64-NEXT: lh a4, 16(a1) +; CHECK-RV64-NEXT: lh a5, 24(a1) +; CHECK-RV64-NEXT: lh a6, 56(a1) +; CHECK-RV64-NEXT: lh a7, 48(a1) +; CHECK-RV64-NEXT: lh t0, 40(a1) +; CHECK-RV64-NEXT: lh a1, 32(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a6, 14(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a7, 12(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh t0, 10(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a1, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a5, 6(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a4, 4(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a3, 2(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sh a2, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_v8i16: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lh a2, 0(a1) +; CHECK-RV32-NEXT: lh a3, 4(a1) +; CHECK-RV32-NEXT: lh a4, 8(a1) +; CHECK-RV32-NEXT: lh a5, 12(a1) +; CHECK-RV32-NEXT: lh a6, 28(a1) +; CHECK-RV32-NEXT: lh a7, 24(a1) +; CHECK-RV32-NEXT: lh t0, 20(a1) +; CHECK-RV32-NEXT: lh a1, 16(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a6, 14(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a7, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh t0, 10(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a1, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a5, 6(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a4, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a3, 2(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sh a2, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_v8i16: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lh a6, 0(a1) +; CHECK-RV64C-NEXT: lh a7, 8(a1) +; CHECK-RV64C-NEXT: lh t0, 16(a1) +; CHECK-RV64C-NEXT: lh a5, 24(a1) +; 
CHECK-RV64C-NEXT: lh a2, 56(a1) +; CHECK-RV64C-NEXT: lh a3, 48(a1) +; CHECK-RV64C-NEXT: lh a4, 40(a1) +; CHECK-RV64C-NEXT: lh a1, 32(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a2, 14(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a3, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a4, 10(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a1, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a5, 6(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh t0, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a7, 2(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sh a6, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_v8i16: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lh a6, 0(a1) +; CHECK-RV32C-NEXT: lh a7, 4(a1) +; CHECK-RV32C-NEXT: lh t0, 8(a1) +; CHECK-RV32C-NEXT: lh a5, 12(a1) +; CHECK-RV32C-NEXT: lh a2, 28(a1) +; CHECK-RV32C-NEXT: lh a3, 24(a1) +; CHECK-RV32C-NEXT: lh a4, 20(a1) +; CHECK-RV32C-NEXT: lh a1, 16(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a2, 14(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a3, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a4, 10(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a1, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a5, 6(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh t0, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a7, 2(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sh a6, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_v8i16: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse16.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_v8i16: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; 
CHECK-RV32V-NEXT: vse16.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <8 x i16> %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_v4i32(ptr %p, <4 x i32> %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_v4i32: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: lw a2, 24(a1) +; CHECK-RV64-NEXT: lw a3, 16(a1) +; CHECK-RV64-NEXT: lw a4, 8(a1) +; CHECK-RV64-NEXT: lw a1, 0(a1) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a2, 12(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a3, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a4, 4(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sw a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_v4i32: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_v4i32: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: lw a2, 24(a1) +; CHECK-RV64C-NEXT: lw a3, 16(a1) +; CHECK-RV64C-NEXT: lw a4, 8(a1) +; CHECK-RV64C-NEXT: lw a1, 0(a1) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a2, 12(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a3, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a4, 4(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sw a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_v4i32: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a3, 8(a0) 
+; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_v4i32: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse32.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_v4i32: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse32.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <4 x i32> %v, ptr %p, !nontemporal !0 + ret void +} + +define void @test_nontemporal_store_v2i64(ptr %p, <2 x i64> %v) { +; CHECK-RV64-LABEL: test_nontemporal_store_v2i64: +; CHECK-RV64: # %bb.0: +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sd a2, 8(a0) +; CHECK-RV64-NEXT: ntl.all +; CHECK-RV64-NEXT: sd a1, 0(a0) +; CHECK-RV64-NEXT: ret +; +; CHECK-RV32-LABEL: test_nontemporal_store_v2i64: +; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: lw a2, 12(a1) +; CHECK-RV32-NEXT: lw a3, 8(a1) +; CHECK-RV32-NEXT: lw a4, 4(a1) +; CHECK-RV32-NEXT: lw a1, 0(a1) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a2, 12(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a3, 8(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a4, 4(a0) +; CHECK-RV32-NEXT: ntl.all +; CHECK-RV32-NEXT: sw a1, 0(a0) +; CHECK-RV32-NEXT: ret +; +; CHECK-RV64C-LABEL: test_nontemporal_store_v2i64: +; CHECK-RV64C: # %bb.0: +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sd a2, 8(a0) +; CHECK-RV64C-NEXT: c.ntl.all +; CHECK-RV64C-NEXT: sd a1, 0(a0) +; CHECK-RV64C-NEXT: ret +; +; CHECK-RV32C-LABEL: test_nontemporal_store_v2i64: +; CHECK-RV32C: # %bb.0: +; CHECK-RV32C-NEXT: lw a2, 12(a1) +; CHECK-RV32C-NEXT: lw a3, 8(a1) +; CHECK-RV32C-NEXT: lw a4, 4(a1) +; CHECK-RV32C-NEXT: lw a1, 0(a1) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a2, 12(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; 
CHECK-RV32C-NEXT: sw a3, 8(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a4, 4(a0) +; CHECK-RV32C-NEXT: c.ntl.all +; CHECK-RV32C-NEXT: sw a1, 0(a0) +; CHECK-RV32C-NEXT: ret +; +; CHECK-RV64V-LABEL: test_nontemporal_store_v2i64: +; CHECK-RV64V: # %bb.0: +; CHECK-RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV64V-NEXT: ntl.all +; CHECK-RV64V-NEXT: vse64.v v8, (a0) +; CHECK-RV64V-NEXT: ret +; +; CHECK-RV32V-LABEL: test_nontemporal_store_v2i64: +; CHECK-RV32V: # %bb.0: +; CHECK-RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-RV32V-NEXT: ntl.all +; CHECK-RV32V-NEXT: vse64.v v8, (a0) +; CHECK-RV32V-NEXT: ret + store <2 x i64> %v, ptr %p, !nontemporal !0 + ret void +} + +!0 = !{i32 1}